From 7b95079dcd4ae066b736c6ea7b2d9f60ad56ad5a Mon Sep 17 00:00:00 2001 From: Bill Teng <135061747+TT-billteng@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:18:55 -0700 Subject: [PATCH] Revert "Ncvetkovic/0 bh llk test coverage dst acc" (#13784) Revert "Ncvetkovic/0 bh llk test coverage dst acc (#13293)" This reverts commit e58dd71d398cd2cac591e42840978c570c4a9a41. --- .../eltwise_copy_block_matmul_partials.cpp | 6 +- .../test_kernels/compute/reconfig.cpp | 21 +- .../test_kernels/compute/reduce_h.cpp | 43 +- .../test_kernels/compute/reduce_hw.cpp | 43 +- .../test_kernels/compute/reduce_w.cpp | 43 +- .../test_copy_block_matmul_partials.cpp | 186 +++---- .../unit_tests/compute/test_reconfig.cpp | 261 +++++---- .../unit_tests/compute/test_reduce.cpp | 365 ++++++------ .../compute/test_untilize_tilize.cpp | 224 +++----- .../compute/matmul/test_matmul_X_tile.cpp | 520 +++++++++--------- 10 files changed, 764 insertions(+), 948 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp index dcd0b9f8f8b..62e58295873 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp @@ -9,6 +9,8 @@ #include "compute_kernel_api/eltwise_unary/eltwise_unary.h" #include "compute_kernel_api.h" +#define START_IN_TILE_ID (0) +#define START_DST_TILE_ID (0) namespace NAMESPACE { void MAIN { @@ -31,9 +33,9 @@ void MAIN { cb_reserve_back(out_cb_id, num_single_transfer); // Copy num_single_transfer tiles from in_cb to DEST - copy_block_matmul_partials(in_cb_id, 0, 0, num_single_transfer); + copy_block_matmul_partials(in_cb_id, START_IN_TILE_ID, START_DST_TILE_ID, num_single_transfer); // Pack num_single_transfer tiles to out_cb - matmul_pack_tile(0, out_cb_id, num_single_transfer); + matmul_pack_tile(START_DST_TILE_ID, out_cb_id, num_single_transfer); // Release DEST reg marking compute/pack complete release_dst(); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp index c3706439495..efbb497fa8a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp @@ -2,14 +2,16 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include "compute_kernel_api/eltwise_binary.h" +#include #include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" #include "compute_kernel_api/tile_move_copy.h" #include "compute_kernel_api/pack.h" #include "compute_kernel_api/reconfig_data_format.h" +#define START_IN_TILE_ID (0) +#define START_DST_TILE_ID (0) + namespace NAMESPACE { void MAIN { uint32_t num_tiles = get_arg_val(0); @@ -18,7 +20,7 @@ void MAIN { constexpr auto cb_in0 = tt::CB::c_in0; // Bfp8_b constexpr auto cb_in1 = tt::CB::c_in1; // Bfp16_b constexpr auto cb_in2 = tt::CB::c_in2; // Bfp16_b - constexpr auto cb_out0 = tt::CB::c_out0; // Fp32 + constexpr auto cb_out0 = tt::CB::c_out0; // Bfp16_b constexpr auto cb_out1 = tt::CB::c_out1; // Bfp8_b @@ -39,17 +41,10 @@ void MAIN { // data inside CB_0, 2nd one inits it to Bfp16_b // which is inside CB_2 copy_tile_init(); - // This call will test copy_tile_to_dst_init_short as well copy_tile_to_dst_init_short_with_dt(cb_in0, cb_in2); cb_wait_front(cb_in2, ublock_size_tiles); -#if (BLOCK_COPY == 1) - for (uint32_t u_cnt = 0; u_cnt < ublock_size_tiles; u_cnt++) { - copy_tile(cb_in2, 0, 0); - } -#elif (BLOCK_COPY == 0) - copy_block_matmul_partials(cb_in2, 0, 0, ublock_size_tiles); -#endif + copy_block_matmul_partials(cb_in2, START_IN_TILE_ID, START_DST_TILE_ID, ublock_size_tiles); cb_pop_front(cb_in2, ublock_size_tiles); // -------------------- Addition with acc ----------------------------- @@ -89,7 +84,7 @@ void MAIN { pack_reconfig_l1_acc(true); #endif // Configured already for CB_16, Bfp16_b - matmul_pack_tile(0, cb_out0, ublock_size_tiles); + matmul_pack_tile(START_DST_TILE_ID, cb_out0, ublock_size_tiles); // Reconfig for CB_17, Bfp8_b, then pack to CB_17 #if (EXPLICIT_RECONFIG == 1) // Indices for old_output, new_output @@ -101,7 +96,7 @@ void MAIN { // Not testing for L1 accumulation pack_reconfig_l1_acc(false); - matmul_pack_tile(0, cb_out1, ublock_size_tiles); + matmul_pack_tile(START_DST_TILE_ID, cb_out1, ublock_size_tiles); release_dst(); cb_pop_front(cb_in0, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp index ac87cd4169d..c77a6a5ec58 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp @@ -4,40 +4,10 @@ #include -#include "compute_kernel_api/reduce.h" - -/* This dummy initialization function is called prior to reduce_init to ensure proper - * initialization of the HW and to test reduce_init_short/reduce_init_delta calls. - * - * - If SHORT_INIT is defined, this function provides API calls - * which initialize the HW properly when supplemented with reduce_init_short or - * reduce_init_delta (note that these two inits are the same except for the "at_start" - * argument; reference reduce.h for more details). - * - If SHORT_INIT is not defined, only the PACK configuration function is called with - * a negative value of the defined "at_start" template argument because full reduce_init - * provides other API calls. - * - * If "at_start = 1", the value that is passed to llk_pack_reduce_config_v2 is 0. - * If "at_start = 0", the value that is passed to llk_pack_reduce_config_v2 is 1. - * - * After dummy_init is called, the proper reduce init call will be invoked with the defined - * value of the argument, not the negated value. This will ensure that the "at_start" - * argument is tested. Reference llk_pack_reduce_config_v2 for more details. - */ -template -ALWI void dummy_init(uint32_t icb = 0, uint32_t icb_scaler = 1, uint32_t ocb = 16) -{ -#ifdef SHORT_INIT - UNPACK(( llk_unpack_AB_hw_configure_disaggregated(icb, icb_scaler) )); - - MATH(( llk_math_pack_sync_init() )); - MATH(( llk_math_hw_configure_disaggregated() )); +#include "debug/dprint.h" - PACK(( llk_pack_init() )); - PACK(( llk_pack_dest_init() )); -#endif - PACK(( llk_pack_reduce_config_v2(ocb) )); -} +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/eltwise_binary.h" namespace NAMESPACE { void MAIN { @@ -45,12 +15,11 @@ void MAIN { constexpr uint32_t Ht = get_compile_time_arg_val(0); constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t NC = get_compile_time_arg_val(2); - constexpr bool at_start = get_compile_time_arg_val(3); - dummy_init(tt::CB::c_in0, tt::CB::c_in2); #ifndef SHORT_INIT - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CB::c_in0, tt::CB::c_in2); #else - reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_out0); + reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); #endif cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp index 905c2da51e5..9e6ab5c086c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp @@ -4,40 +4,10 @@ #include -#include "compute_kernel_api/reduce.h" - -/* This dummy initialization function is called prior to reduce_init to ensure proper - * initialization of the HW and to test reduce_init_short/reduce_init_delta calls. - * - * - If SHORT_INIT is defined, this function provides API calls - * which initialize the HW properly when supplemented with reduce_init_short or - * reduce_init_delta (note that these two inits are the same except for the "at_start" - * argument; reference reduce.h for more details). - * - If SHORT_INIT is not defined, only the PACK configuration function is called with - * a negative value of the defined "at_start" template argument because full reduce_init - * provides other API calls. - * - * If "at_start = 1", the value that is passed to llk_pack_reduce_config_v2 is 0. - * If "at_start = 0", the value that is passed to llk_pack_reduce_config_v2 is 1. - * - * After dummy_init is called, the proper reduce init call will be invoked with the defined - * value of the argument, not the negated value. This will ensure that the "at_start" - * argument is tested. Reference llk_pack_reduce_config_v2 for more details. - */ -template -ALWI void dummy_init(uint32_t icb = 0, uint32_t icb_scaler = 1, uint32_t ocb = 16) -{ -#ifdef SHORT_INIT - UNPACK(( llk_unpack_AB_hw_configure_disaggregated(icb, icb_scaler) )); - - MATH(( llk_math_pack_sync_init() )); - MATH(( llk_math_hw_configure_disaggregated() )); +#include "debug/dprint.h" - PACK(( llk_pack_init() )); - PACK(( llk_pack_dest_init() )); -#endif - PACK(( llk_pack_reduce_config_v2(ocb) )); -} +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/eltwise_binary.h" namespace NAMESPACE { void MAIN { @@ -45,12 +15,11 @@ void MAIN { constexpr uint32_t Ht = get_compile_time_arg_val(0); constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t NC = get_compile_time_arg_val(2); - constexpr bool at_start = get_compile_time_arg_val(3); - dummy_init(tt::CB::c_in0, tt::CB::c_in2); #ifndef SHORT_INIT - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CB::c_in0, tt::CB::c_in2); #else - reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_out0); + reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); #endif cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp index e413429e8a3..ededadfb0b2 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp @@ -4,40 +4,10 @@ #include -#include "compute_kernel_api/reduce.h" - -/* This dummy initialization function is called prior to reduce_init to ensure proper - * initialization of the HW and to test reduce_init_short/reduce_init_delta calls. - * - * - If SHORT_INIT is defined, this function provides API calls - * which initialize the HW properly when supplemented with reduce_init_short or - * reduce_init_delta (note that these two inits are the same except for the "at_start" - * argument; reference reduce.h for more details). - * - If SHORT_INIT is not defined, only the PACK configuration function is called with - * a negative value of the defined "at_start" template argument because full reduce_init - * provides other API calls. - * - * If "at_start = 1", the value that is passed to llk_pack_reduce_config_v2 is 0. - * If "at_start = 0", the value that is passed to llk_pack_reduce_config_v2 is 1. - * - * After dummy_init is called, the proper reduce init call will be invoked with the defined - * value of the argument, not the negated value. This will ensure that the "at_start" - * argument is tested. Reference llk_pack_reduce_config_v2 for more details. - */ -template -ALWI void dummy_init(uint32_t icb = 0, uint32_t icb_scaler = 1, uint32_t ocb = 16) -{ -#ifdef SHORT_INIT - UNPACK(( llk_unpack_AB_hw_configure_disaggregated(icb, icb_scaler) )); - - MATH(( llk_math_pack_sync_init() )); - MATH(( llk_math_hw_configure_disaggregated() )); +#include "debug/dprint.h" - PACK(( llk_pack_init() )); - PACK(( llk_pack_dest_init() )); -#endif - PACK(( llk_pack_reduce_config_v2(ocb) )); -} +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/eltwise_binary.h" namespace NAMESPACE { void MAIN { @@ -45,12 +15,11 @@ void MAIN { constexpr uint32_t Ht = get_compile_time_arg_val(0); constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t NC = get_compile_time_arg_val(2); - constexpr bool at_start = get_compile_time_arg_val(3); - dummy_init(tt::CB::c_in0, tt::CB::c_in2); #ifndef SHORT_INIT - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CB::c_in0, tt::CB::c_in2); #else - reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_out0); + reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); #endif cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp index cb9f7bbaf69..5cd7d0f4f24 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp @@ -3,30 +3,25 @@ // SPDX-License-Identifier: Apache-2.0 #include "device_fixture.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - using namespace tt; -using namespace tt::test_utils; namespace unit_tests::compute::matmul_partials { struct CopyBlockMatmulPartialsConfig { - uint32_t single_tile_size = 2 * 32 * 32; - uint32_t num_tiles = 1; - // *_ublock defines no. of tiles finished with single LLK API call: - uint32_t reader_ublock = 1; - uint32_t writer_ublock = 1; - uint32_t compute_ublock = 1; - uint32_t src0_cb_index = 0; - uint32_t ouput_cb_index = 16; - // Whether or not we want the result to be stored in DST in FP32: - bool fp32_dest_acc_en = false; - // Whether or not to sync full/half DST between MATH and PACK: - bool dst_full_sync_en = false; + uint32_t single_tile_size; + uint32_t num_tiles; + uint32_t reader_ublock; + uint32_t writer_ublock; + uint32_t compute_ublock; + uint32_t src0_cb_index; + uint32_t ouput_cb_index; + bool dst_full_sync_en; }; void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const CopyBlockMatmulPartialsConfig& test_config) { + + //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// @@ -54,24 +49,14 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const uint32_t src0_cb_index = test_config.src0_cb_index; uint32_t num_input_tiles = test_config.reader_ublock; - tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); - - if (test_config.fp32_dest_acc_en) { - cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float32}}) - .set_page_size(src0_cb_index, single_tile_size); - } auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); uint32_t ouput_cb_index = test_config.ouput_cb_index; uint32_t num_output_tiles = test_config.writer_ublock; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); - if (test_config.fp32_dest_acc_en) { - cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float32}}) - .set_page_size(ouput_cb_index, single_tile_size); - } auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); auto unary_reader_kernel = tt_metal::CreateKernel( @@ -93,36 +78,21 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const uint(ouput_cb_index) // Output CB idx }; - std::map defines; - if (test_config.fp32_dest_acc_en) { - defines["DST_ACCUM_MODE"] = "1"; - } auto eltwise_unary_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp", core, - tt_metal::ComputeConfig{.fp32_dest_acc_en = test_config.fp32_dest_acc_en, - .dst_full_sync_en = test_config.dst_full_sync_en, - .compile_args = compute_kernel_args, - .defines = defines} + tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en, + .compile_args = compute_kernel_args} ); + //////////////////////////////////////////////////////////////////////////// // Execute Application //////////////////////////////////////////////////////////////////////////// - std::vector src_vec = create_random_vector_of_bfloat16( - dram_buffer_size, 100, 0); - - if (test_config.fp32_dest_acc_en) { - auto src_vec_float = generate_uniform_random_vector( - -100, 100, dram_buffer_size/sizeof(float), 0); - for (auto i = 0; i < src_vec.size(); i++) { - std::memcpy(&src_vec[i], &src_vec_float[i], sizeof(float)); - src_vec[i] &= 0xFFFFE000; - } - } - - tt_metal::detail::WriteToBuffer(src_dram_buffer_bf16, src_vec); + std::vector src_vec_bf16 = create_random_vector_of_bfloat16( + dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); + tt_metal::detail::WriteToBuffer(src_dram_buffer_bf16, src_vec_bf16); tt_metal::SetRuntimeArgs( program, @@ -153,71 +123,97 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const std::vector result_vec_bf16; tt_metal::detail::ReadFromBuffer(dst_dram_buffer, result_vec_bf16); + //////////////////////////////////////////////////////////////////////////// // Validation & Teardown //////////////////////////////////////////////////////////////////////////// - EXPECT_EQ(src_vec.size(), result_vec_bf16.size()); - EXPECT_EQ(src_vec, result_vec_bf16); + EXPECT_EQ(src_vec_bf16.size(), result_vec_bf16.size()); + EXPECT_EQ(src_vec_bf16, result_vec_bf16); + + } } // namespace unit_tests::compute::matmul_partials //////////////////////////////////////////////////////////////////////////// -// Test Description +// Tests // ------------------------------------------------------------------------ -// These tests aim to cover usage of these API calls: +// These tests aim to cover usage of these calls: // - copy_block_matmul_partials // - matmul_pack_tile +// +// Tests which contain a string in RXWYCZ format in their name cover +// different scenarios in reader/writer/compute kernel usage. Letters +// R, W and C represent reader, writer and compute kernel, respectively, +// while the numbers X, Y and Z represent how many tiles will a kernel +// move in a single loop iteration. This is important because depending +// on these numbers, synchronization points are met at different places. +// Since there can be a maximum of 8 32-by-32 tiles in DEST reg when using +// half of it (for MATH/PACK sync purporses), highest bandwidth is achieved +// when all three parameters are 8. It's also possible to enforce MATH/PACK +// serialization by telling writer to wait for a single tile to be avail- +// able in output CB. +// //////////////////////////////////////////////////////////////////////////// +TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W8C8) { + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { + .single_tile_size = 2 * 1024, + .num_tiles = 8, + .reader_ublock = 8, + .writer_ublock = 8, + .compute_ublock = 8, + .src0_cb_index = 0, + .ouput_cb_index = 16, + .dst_full_sync_en = dst_full_sync_en + }; + unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); + } +} -TEST_F(DeviceFixture, ComputeCopyBlockSingle) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - log_info(LogTest, "FP32DestAcc = {}, DstSyncFull = {}", fp32_dest_acc_en, dst_full_sync_en); - unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { - .num_tiles = 8, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en - }; - unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); - } +TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W8C1) { + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { + .single_tile_size = 2 * 1024, + .num_tiles = 8, + .reader_ublock = 8, + .writer_ublock = 8, + .compute_ublock = 1, + .src0_cb_index = 0, + .ouput_cb_index = 16, + .dst_full_sync_en = dst_full_sync_en + }; + unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); } } -TEST_F(DeviceFixture, ComputeCopyBlockMultiple) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - log_info(LogTest, "FP32DestAcc = {}, DstSyncFull = {}", fp32_dest_acc_en, dst_full_sync_en); - unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { - .num_tiles = 8, - .reader_ublock = 8, - .writer_ublock = 8, - .compute_ublock = 8, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en - }; - unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); - } + +TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W1C1) { + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { + .single_tile_size = 2 * 1024, + .num_tiles = 8, + .reader_ublock = 8, + .writer_ublock = 1, + .compute_ublock = 1, + .src0_cb_index = 0, + .ouput_cb_index = 16, + .dst_full_sync_en = dst_full_sync_en + }; + unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); } } -TEST_F(DeviceFixture, ComputeCopyBlockComputeBottleneck) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - log_info(LogTest, "FP32DestAcc = {}, DstSyncFull = {}", fp32_dest_acc_en, dst_full_sync_en); - unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { - .num_tiles = 8, - .reader_ublock = 8, - .writer_ublock = 8, - .compute_ublock = 1, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en - }; - unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); - } +TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR1W1C1) { + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = { + .single_tile_size = 2 * 1024, + .num_tiles = 1, + .reader_ublock = 1, + .writer_ublock = 1, + .compute_ublock = 1, + .src0_cb_index = 0, + .ouput_cb_index = 16, + .dst_full_sync_en = dst_full_sync_en + }; + unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config); } } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp index 95c902c026f..9d48d09ccaa 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include "device_fixture.hpp" #include "tt_metal/common/bfloat8.hpp" #include "tt_metal/test_utils/comparison.hpp" @@ -14,58 +13,38 @@ namespace unit_tests::compute::reconfig { struct ReconfigConfig { size_t num_tiles = 0; - // Number of tiles finished with single LLK API call: size_t ublock_size_tiles = 0; - // Reconfig LLK API calls can either explicitly or implicitly take previous - // CB indices; which version of the call is used is defined by this flag: bool explicit_reconfig = false; - // Some reconfig calls are joined for SrcA/B; whether split or joined calls - // are used is defined with this flag: bool split_src_reconfig = false; - // This flag defines whether regular packing to L1 is used, or the one - // where the result is accumulated with the previous value: bool l1_acc = false; - // Whether or not we want the result to be stored in DST in FP32 and/or - // accumulated with previous DST value is controlled with this flag: - bool fp32_dest_acc_en = false; - // Whether to test with copy_tile or copy_block_matmul_partials is contro- - // lled with this flag: - bool block_copy = true; - // Whether or not to sync full/half DST between MATH and PACK: bool dst_full_sync_en = false; }; -using VariantVectorType = std::variant, std::vector>; - /// @brief Does Dramx3 --> Reader --> CB --> Add with acc --> CB --> Writer --> Dram /// @param device /// @param test_config - Configuration of the test -- see struct /// @return bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_config) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - bool pass = true; uint32_t in0_id = 0; uint32_t in1_id = 1; uint32_t in2_id = 2; uint32_t out0_id = 16; uint32_t out1_id = 17; static float out0_result_old = 0; - // Since golden is not perfect, some corner cases for these values will - // make the tests fail. However, this is a representative example since - // it utilizes the full BFP16 presicion and range: - float in0_val = 1.0; - float in1_val = 127.0; - float in2_val = 0.0078125; - uint32_t single_tile_size_fp32 = 4 * 32 * 32; // Single 32x32 tile size for Float32 - uint32_t single_tile_size_bfp16b = 2 * 32 * 32; // Single 32x32 tile size for Float16_b - uint32_t single_tile_size_bfp8b = 1 * 32 * 32 + 64; // Single 32x32 tile size for Bfp8_b - uint32_t single_tile_size_out0 = test_config.fp32_dest_acc_en ? single_tile_size_fp32 : single_tile_size_bfp16b; + // Since golden is not perfect, don't change these values much + float in0_val = 1.8601; + float in1_val = 0.0003; + float in2_val = 2.03456; + uint32_t single_tile_size_bfp16b = 2 * 32 * 32; // Single 32x32 tile size for Float16_b + uint32_t single_tile_size_bfp8b = 1 * 32 * 32 + 64; // Single 32x32 tile size for Bfp8_b const size_t dram_buffer_size_bfp16b = test_config.num_tiles * single_tile_size_bfp16b; const size_t dram_buffer_size_bfp8b = test_config.num_tiles * single_tile_size_bfp8b; - const size_t dram_buffer_size_out0 = test_config.num_tiles * single_tile_size_out0; CoreCoord core = {0, 0}; tt_metal::Program program = tt_metal::CreateProgram(); @@ -76,9 +55,6 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c tt::tt_metal::InterleavedBufferConfig dram_config_bfp8b{ .device = device, .size = dram_buffer_size_bfp8b, .page_size = dram_buffer_size_bfp8b, .buffer_type = tt::tt_metal::BufferType::DRAM}; - tt::tt_metal::InterleavedBufferConfig dram_config_out0{ - .device = device, .size = dram_buffer_size_out0, .page_size = dram_buffer_size_out0, .buffer_type = tt::tt_metal::BufferType::DRAM}; - // This will be srcB in Bfp8_b auto input0_dram_buffer = CreateBuffer(dram_config_bfp8b); uint32_t input0_dram_byte_address = input0_dram_buffer->address(); @@ -94,8 +70,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c uint32_t input2_dram_byte_address = input2_dram_buffer->address(); auto input2_dram_noc_xy = input2_dram_buffer->noc_coordinates(); - // This will be Output0 in Float32 or Float16_b depending on fp32_dest_acc_en - auto output0_dram_buffer = CreateBuffer(dram_config_out0); + // This will be Output0 in Float16_b + auto output0_dram_buffer = CreateBuffer(dram_config_bfp16b); uint32_t output0_dram_byte_address = output0_dram_buffer->address(); auto output0_dram_noc_xy = output0_dram_buffer->noc_coordinates(); @@ -120,8 +96,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c auto l1_input2_cb = tt_metal::CreateCircularBuffer(program, core, l1_input2_cb_config); tt_metal::CircularBufferConfig l1_output0_cb_config = - tt_metal::CircularBufferConfig(dram_buffer_size_out0, {{out0_id, (test_config.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)}}) - .set_page_size(out0_id, single_tile_size_out0); + tt_metal::CircularBufferConfig(dram_buffer_size_bfp16b, {{out0_id, tt::DataFormat::Float16_b}}) + .set_page_size(out0_id, single_tile_size_bfp16b); auto l1_output0_cb = tt_metal::CreateCircularBuffer(program, core, l1_output0_cb_config); tt_metal::CircularBufferConfig l1_output1_cb_config = @@ -132,12 +108,22 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c vector compute_kernel_args = {}; std::map defines; - - defines["DST_ACCUM_MODE"] = "1"; // Needed always in order for reader kernel to load data from CB2 - defines["EXPLICIT_RECONFIG"] = test_config.explicit_reconfig ? "1" : "0"; - defines["SPLIT_SRC_RECONFIG"] = test_config.split_src_reconfig ? "1" : "0"; - defines["BLOCK_COPY"] = test_config.block_copy ? "1" : "0"; - defines["L1_ACC"] = test_config.l1_acc ? "1" : "0"; + defines["DST_ACCUM_MODE"] = "1"; + if (test_config.explicit_reconfig) { + defines["EXPLICIT_RECONFIG"] = "1"; + } else { + defines["EXPLICIT_RECONFIG"] = "0"; + } + if (test_config.split_src_reconfig) { + defines["SPLIT_SRC_RECONFIG"] = "1"; + } else { + defines["SPLIT_SRC_RECONFIG"] = "0"; + } + if (test_config.l1_acc) { + defines["L1_ACC"] = "1"; + } else { + defines["L1_ACC"] = "0"; + } auto reader_kernel = tt_metal::CreateKernel( program, @@ -157,11 +143,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c program, "tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp", core, - tt_metal::ComputeConfig{ - .fp32_dest_acc_en = test_config.fp32_dest_acc_en, - .dst_full_sync_en = test_config.dst_full_sync_en, - .compile_args = compute_kernel_args, - .defines = defines}); + tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en, + .compile_args = compute_kernel_args, .defines = defines}); SetRuntimeArgs( program, @@ -172,6 +155,7 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c uint32_t(test_config.ublock_size_tiles), }); + //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// @@ -191,6 +175,7 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c dram_buffer_size_bfp16b, in2_val); + //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// @@ -202,44 +187,21 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c // 19 bits since that's the width of srcA/B/FPU. This is why it's // float32 in golden. As for golden1, it should be Bfp8_b in the end, // but since there's no available conversion from Float16_b to Bfp8_b, - // it remains in float and is then converted to Bfp8_b. + // it's left in float and then converted to Bfp8_b. std::vector temp_golden(input1.size()); - - // It's tricky to make a variable-type vector, so create two for each case - // of fp32_dest_acc_en, fp32 when true, fp16 when false - std::vector golden0_fp32(input1.size()); - std::vector golden0_bfp16(input1.size()); - // This vector will hold unpacked Bfp8 result: + std::vector golden0(input1.size()); std::vector golden1(input1.size()); - // This vector will hold packed fp16_b/fp32 result: - std::vector packed_golden0(input1.size()); for (auto i = 0; i < temp_golden.size(); i++) { - // Do temp = SrcA + SrcB: temp_golden[i] = input1[i].to_float() + bfloat16(input0[i]).to_float(); - // Do temp + DST, store in out0 vector depending on fp32_dest_acc_en: - if (test_config.fp32_dest_acc_en) { - golden0_fp32[i] = temp_golden[i] + input2[i].to_float(); - } else { - golden0_bfp16[i] = bfloat16(temp_golden[i] + input2[i].to_float()); - } - // Do out1 = temp + DST: + golden0[i] = bfloat16(temp_golden[i] + input2[i].to_float()); golden1[i] = bfloat16(temp_golden[i] + input2[i].to_float()).to_float(); - // Do out0[bfp16] = temp + L1, this makes sense only if not fp32_dest_acc_en: - if (test_config.l1_acc && !test_config.fp32_dest_acc_en) { - golden0_bfp16[i] = bfloat16(golden0_bfp16[i].to_float() + out0_result_old); + if (test_config.l1_acc) { + golden0[i] = bfloat16(golden0[i].to_float() + out0_result_old); } else { - out0_result_old = golden0_bfp16[i].to_float(); - } - // Cast float32 to "packed "uint32 out0 vector if fp32_dest_acc_en: - if (test_config.fp32_dest_acc_en) { - packed_golden0[i] = std::bit_cast(golden0_fp32[i]); + out0_result_old = golden0[i].to_float(); } } - // Pack out0 vector if not fp32_dest_acc_en: - if (!test_config.fp32_dest_acc_en) { - packed_golden0 = pack_vector(golden0_bfp16); - } - // Pack out1 vector: + std::vector packed_golden0 = pack_vector(golden0); std::vector packed_golden1 = pack_fp32_vec_as_bfp8_tiles(golden1, true, false); // //////////////////////////////////////////////////////////////////////////// @@ -297,67 +259,95 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c dest0_buffer_data, packed_golden0, [&](const bfloat16& a, const bfloat16& b) { - return is_close(a, b, 0.0155f); + return is_close(a, b, 0.015f); }); pass &= is_close_packed_vectors( dest1_buffer_data, packed_golden1, [&](const bfloat16& a, const bfloat16& b) { - return is_close(a, b, 0.0155); + return is_close(a, b, 0.015f); }); return pass; } } // namespace unit_tests::compute::binary -//////////////////////////////////////////////////////////////////////////// -// Test Description -// ------------------------------------------------------------------------ -// These tests aim to cover usage of these API calls: -// - copy_tile_init -// - copy_tile_to_dst_init_short -// - copy_tile_to_dst_init_short_with_dt -// - unpack_reconfig_data_format -// - unpack_reconfig_data_format_srca -// - unpack_reconfig_data_format_srcb -// - pack_reconfig_l1_acc -//////////////////////////////////////////////////////////////////////////// - -TEST_F(DeviceFixture, TileCopyReconfigExplicitSplitDstAcc) { +TEST_F(DeviceFixture, TileCopyReconfigExplicitSplit) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); } - for (bool explicit_reconfig : {true, false}) { - for (bool split_src_reconfig : {true, false}) { - for (bool fp32_dest_acc_en : {true, false}) { - for (bool block_copy : {true, false}) { - for (bool dst_full_sync_en : {true, false}) { - log_info(LogTest, "Block Copy = {}, " - "Explicit = {}, " - "Split = {}, " - "FP32DestAcc = {}" - "DstSyncFull = {}.", - block_copy, - explicit_reconfig, - split_src_reconfig, - fp32_dest_acc_en, - dst_full_sync_en); - unit_tests::compute::reconfig::ReconfigConfig test_config = { - .num_tiles = 1, - .ublock_size_tiles = 1, - .explicit_reconfig = explicit_reconfig, - .split_src_reconfig = split_src_reconfig, - .fp32_dest_acc_en = fp32_dest_acc_en, - .block_copy = block_copy, - .dst_full_sync_en = dst_full_sync_en - }; - for (unsigned int id = 0; id < num_devices_; id++) { - ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); - } - } - } - } + + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::reconfig::ReconfigConfig test_config = { + .num_tiles = 1, + .ublock_size_tiles = 1, + .explicit_reconfig = true, + .split_src_reconfig = true, + .dst_full_sync_en = dst_full_sync_en + }; + for (unsigned int id = 0; id < num_devices_; id++) { + ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); + } + } +} + +TEST_F(DeviceFixture, TileCopyReconfigExplicitJoined) { + auto arch = this->arch_; + if (arch == tt::ARCH::GRAYSKULL) { + GTEST_SKIP(); + } + + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::reconfig::ReconfigConfig test_config = { + .num_tiles = 1, + .ublock_size_tiles = 1, + .explicit_reconfig = true, + .split_src_reconfig = false, + .dst_full_sync_en = dst_full_sync_en + }; + for (unsigned int id = 0; id < num_devices_; id++) { + ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); + } + } +} + +TEST_F(DeviceFixture, TileCopyReconfigImplicitSplit) { + auto arch = this->arch_; + if (arch == tt::ARCH::GRAYSKULL) { + GTEST_SKIP(); + } + + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::reconfig::ReconfigConfig test_config = { + .num_tiles = 1, + .ublock_size_tiles = 1, + .explicit_reconfig = false, + .split_src_reconfig = true, + .dst_full_sync_en = dst_full_sync_en + }; + for (unsigned int id = 0; id < num_devices_; id++) { + ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); + } + } +} + +TEST_F(DeviceFixture, TileCopyReconfigImplicitJoined) { + auto arch = this->arch_; + if (arch == tt::ARCH::GRAYSKULL) { + GTEST_SKIP(); + } + + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::reconfig::ReconfigConfig test_config = { + .num_tiles = 1, + .ublock_size_tiles = 1, + .explicit_reconfig = false, + .split_src_reconfig = false, + .dst_full_sync_en = dst_full_sync_en + }; + for (unsigned int id = 0; id < num_devices_; id++) { + ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); } } } @@ -367,17 +357,20 @@ TEST_F(DeviceFixture, TileCopyReconfigL1Acc) { if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); } - for (bool l1_acc : {true, false}) { - for (bool dst_full_sync_en : {true, false}) { - log_info(LogTest, "L1 accumulation is {}, DstSyncFull = {}", l1_acc ? "on." : "off.", dst_full_sync_en); - unit_tests::compute::reconfig::ReconfigConfig test_config = { - .num_tiles = 1, - .ublock_size_tiles = 1, - .dst_full_sync_en = dst_full_sync_en - }; - for (unsigned int id = 0; id < num_devices_; id++) { - ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); - } + + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::reconfig::ReconfigConfig test_config = { + .num_tiles = 1, + .ublock_size_tiles = 1, + .dst_full_sync_en = dst_full_sync_en + }; + for (unsigned int id = 0; id < num_devices_; id++) { + test_config.l1_acc = false; + ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); + log_info(LogTest, "Passed without L1 accumulation"); + test_config.l1_acc = true; + ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config)); + log_info(LogTest, "Passed with L1 accumulation"); } } } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp index ba147a19ed5..c12dfb809be 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp @@ -51,11 +51,8 @@ struct ReduceConfig { std::function(const std::vector&, const std::vector&, float, uint8_t, bool)> golden_function; std::vector result_shape; bool math_only_reduce = false; - // Whether or not we want the result to be stored in DST in FP32: bool fp32_dest_acc_en = false; - // Whether or not to sync full/half DST between MATH and PACK: bool dst_full_sync_en = false; - bool at_start = false; MathFidelity math_fidelity = MathFidelity::HiFi4; }; @@ -294,7 +291,6 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig uint(Ht), uint(Wt), uint(NC), - test_config.at_start, }; std::map reduce_defines = { @@ -370,13 +366,6 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig log_error(LogTest, "Failure position={}", argfail); EXPECT_TRUE(pass); - log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}, at_start = {}", - test_config.math_fidelity, - test_config.reduce_type, - test_config.fp32_dest_acc_en, - test_config.dst_full_sync_en, - test_config.at_start - ); } } // namespace unit_tests::compute::reduce @@ -396,25 +385,23 @@ TEST_F(DeviceFixture, ComputeReduceH) { for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .shape = shape, - .reduce_dim = ReduceDim::H, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_h, - .result_shape = result_shape, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid), - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .shape = shape, + .reduce_dim = ReduceDim::H, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_h, + .result_shape = result_shape, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid), + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -429,27 +416,25 @@ TEST_F(DeviceFixture, ComputeReduceW) { if (math_fid == 1) continue; for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .shape = shape, - .reduce_dim = ReduceDim::W, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_w, - .result_shape = result_shape, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid), - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .shape = shape, + .reduce_dim = ReduceDim::W, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_w, + .result_shape = result_shape, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid), + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -464,28 +449,26 @@ TEST_F(DeviceFixture, ComputeReduceHW) { if (math_fid == 1) continue; for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { - // Currently fp32 dest unsupported with reduce scalar - if (fp32_dest_acc_en) continue; for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .shape = shape, - .reduce_dim = ReduceDim::HW, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_hw, - .result_shape = result_shape, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + // Currently fp32 dest unsupported with reduce scalar + if (fp32_dest_acc_en) continue; + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .shape = shape, + .reduce_dim = ReduceDim::HW, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_hw, + .result_shape = result_shape, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -505,26 +488,24 @@ TEST_F(DeviceFixture, ComputeReduceHMathOnly) { for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .shape = shape, - .reduce_dim = ReduceDim::H, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_h, - .result_shape = result_shape, - .math_only_reduce = true, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .shape = shape, + .reduce_dim = ReduceDim::H, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_h, + .result_shape = result_shape, + .math_only_reduce = true, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -539,28 +520,26 @@ TEST_F(DeviceFixture, ComputeReduceWMathOnly) { if (math_fid == 1) continue; for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .shape = shape, - .reduce_dim = ReduceDim::W, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_w, - .result_shape = result_shape, - .math_only_reduce = true, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .shape = shape, + .reduce_dim = ReduceDim::W, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_w, + .result_shape = result_shape, + .math_only_reduce = true, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -575,29 +554,27 @@ TEST_F(DeviceFixture, ComputeReduceHWMathOnly) { if (math_fid == 1) continue; for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { - // Currently fp32 dest unsupported with reduce scalar - if (fp32_dest_acc_en) continue; for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .shape = shape, - .reduce_dim = ReduceDim::HW, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_hw, - .result_shape = result_shape, - .math_only_reduce = true, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + // Currently fp32 dest unsupported with reduce scalar + if (fp32_dest_acc_en) continue; + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .shape = shape, + .reduce_dim = ReduceDim::HW, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_hw, + .result_shape = result_shape, + .math_only_reduce = true, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -617,26 +594,24 @@ TEST_F(DeviceFixture, ComputeReduceHShortInit) { for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .short_init = true, - .shape = shape, - .reduce_dim = ReduceDim::H, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_h, - .result_shape = result_shape, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .short_init = true, + .shape = shape, + .reduce_dim = ReduceDim::H, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_h, + .result_shape = result_shape, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -651,28 +626,26 @@ TEST_F(DeviceFixture, ComputeReduceWShortInit) { if (math_fid == 1) continue; for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .short_init = true, - .shape = shape, - .reduce_dim = ReduceDim::W, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_w, - .result_shape = result_shape, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .short_init = true, + .shape = shape, + .reduce_dim = ReduceDim::W, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_w, + .result_shape = result_shape, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } @@ -687,29 +660,27 @@ TEST_F(DeviceFixture, ComputeReduceHWShortInit) { if (math_fid == 1) continue; for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) { for (bool fp32_dest_acc_en : {true, false}) { - // Currently fp32 dest unsupported with reduce scalar - if (fp32_dest_acc_en) continue; for (bool dst_full_sync_en : {true, false}) { - for (bool at_start : {true, false}) { - ReduceConfig test_config = { - .short_init = true, - .shape = shape, - .reduce_dim = ReduceDim::HW, - .reduce_type = ReduceType(reduce_type), - .data_gen_rand_max = 10.0f, - .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), - .data_gen_offset = -10.0f, - .atol = 1e-2f, - .rtol = 0.08f, - .golden_function = unit_tests::compute::gold_reduce_hw, - .result_shape = result_shape, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .at_start = at_start, - .math_fidelity = MathFidelity(math_fid) - }; - run_single_core_reduce_program(this->devices_.at(0), test_config); - } + // Currently fp32 dest unsupported with reduce scalar + if (fp32_dest_acc_en) continue; + log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en); + ReduceConfig test_config = { + .short_init = true, + .shape = shape, + .reduce_dim = ReduceDim::HW, + .reduce_type = ReduceType(reduce_type), + .data_gen_rand_max = 10.0f, + .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(), + .data_gen_offset = -10.0f, + .atol = 1e-2f, + .rtol = 0.08f, + .golden_function = unit_tests::compute::gold_reduce_hw, + .result_shape = result_shape, + .fp32_dest_acc_en = fp32_dest_acc_en, + .dst_full_sync_en = dst_full_sync_en, + .math_fidelity = MathFidelity(math_fid) + }; + run_single_core_reduce_program(this->devices_.at(0), test_config); } } } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp index 91071283a51..c96a44be6d2 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp @@ -20,6 +20,7 @@ using namespace tt; using namespace tt::test_utils; +using namespace tt::test_utils::df; using namespace tt::tt_metal; namespace unit_tests::compute::tilize { @@ -41,21 +42,13 @@ using GoldenFunc = std::variant< std::function(const std::vector&, const std::vector&, const GoldenConfig &config)> >; struct TestConfig { - // Whether or not to use *_init_short LLK API calls: bool short_init = false; - // Whether or not to sync full/half DST between MATH and PACK: bool dst_full_sync_en = false; - // Whether or not we want the result to be stored in DST in FP32 is - // controlled with this flag: - bool fp32_dest_acc_en = false; uint32_t input_single_tile_size; uint32_t output_single_tile_size; - // Block height in tiles: uint32_t num_tiles_r; - // Block width in tiles: uint32_t num_tiles_c; uint32_t num_faces_per_tile = 4; - // Face height in datums: uint32_t face_r_dim = 16; std::optional untilize_type = std::nullopt; std::optional tilize_type = std::nullopt; @@ -68,6 +61,8 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& CoreCoord core = {0, 0}; uint32_t num_tiles = test_config.num_tiles_r * test_config.num_tiles_c; + log_info(tt::LogTest, "Running test for num_tiles_r = {}, num_tiles_c = {}", test_config.num_tiles_r, test_config.num_tiles_c); + uint32_t input_dram_buffer_size = test_config.input_single_tile_size * num_tiles; uint32_t output_dram_buffer_size = test_config.output_single_tile_size * num_tiles; @@ -118,9 +113,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& uint32_t ouput_cb_index = 16; // output operands start at index 16 uint32_t num_output_tiles = num_tiles; - tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( - num_output_tiles * test_config.output_single_tile_size, - {{ouput_cb_index, test_config.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b}}) + tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * test_config.output_single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, test_config.output_single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); @@ -130,7 +123,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& } else if(test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { reader_kernel_path = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp"; } else { - reader_kernel_path = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp"; + reader_kernel_path = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; } auto reader_kernel = tt_metal::CreateKernel( @@ -168,22 +161,17 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& std::map defines = {}; - if (test_config.short_init) { + if (test_config.short_init) + { defines["SHORT_INIT"] = "1"; } - if (test_config.fp32_dest_acc_en) { - defines["DST_ACCUM_MODE"] = "1"; - } auto eltwise_unary_kernel = tt_metal::CreateKernel( program, compute_kernel, core, - tt_metal::ComputeConfig{ - .fp32_dest_acc_en = test_config.fp32_dest_acc_en, - .dst_full_sync_en = test_config.dst_full_sync_en, - .compile_args = compute_kernel_args, - .defines = defines} + tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en, + .compile_args = compute_kernel_args, .defines = defines} ); std::vector src0_vec = create_arange_vector_of_bfloat16(input_dram_buffer_size, false); @@ -217,10 +205,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& {dram_buffer_src0_addr, (std::uint32_t)dram_src0_noc_xy.x, (std::uint32_t)dram_src0_noc_xy.y, - num_tiles, - src0_cb_index, - test_config.num_tiles_c, - false + num_tiles }); } @@ -260,23 +245,12 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& } }, test_config.golden_function); - - if(test_config.fp32_dest_acc_en) { - vector golden_unpacked = unpack_vector(golden); - // Increasing the size since from BFP16 two times, since storing is in FP32 - golden.resize(golden.size() * 2); - for (auto i = 0; i < golden_unpacked.size(); i++) { - // Cast float32 to "packed "uint32 golden vector if fp32_dest_acc_en: - golden[i] = std::bit_cast(golden_unpacked[i].to_float()); - } - } - if(test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { pass &= (golden.size() == result_vec.size()); - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( result_vec, golden, - [&](const bfloat16& a, const bfloat16& b) { + [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.01f); }); @@ -287,17 +261,12 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& if (not pass){ std::cout << "GOLDEN " << std::endl; - print_vector(unpack_vector(golden)); + print_vector(unpack_vector(golden)); std::cout << "RESULTS " << std::endl; - print_vector(unpack_vector(result_vec)); + print_vector(unpack_vector(result_vec)); } ASSERT_TRUE(pass); - log_info(tt::LogTest, "Done running test with: num_tiles_r = {}, num_tiles_c = {}, FP32_DestAcc = {}, DstSyncFull = {}, pass = {}", - test_config.num_tiles_r, - test_config.num_tiles_c, - test_config.fp32_dest_acc_en, - test_config.dst_full_sync_en, - pass); + log_info(tt::LogTest, "Done running test for num_tiles_r = {}, num_tiles_c = {}, pass = {}", test_config.num_tiles_r, test_config.num_tiles_c, pass); } } // namespace unit_tests::compute::tilize @@ -307,24 +276,19 @@ Following tests are for Unpack Tilize ***************************************/ TEST_F(DeviceFixture, ComputeUnpackTilize) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS and unpack_tilize hangs on BH -> tt-metal/#13640 - if ((fp32_dest_acc_en == true) && (this->arch_ != tt::ARCH::WORMHOLE_B0)) continue; - for (bool dst_full_sync_en : {true, false}) { - unit_tests::compute::tilize::TestConfig test_config = { - .dst_full_sync_en = dst_full_sync_en, - .fp32_dest_acc_en = fp32_dest_acc_en, - .input_single_tile_size = 2 * 1024, - .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2), - .num_tiles_r = num_tile[0], - .num_tiles_c = num_tile[1], - .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A, - .golden_function = unit_tests::compute::gold_standard_tilize - }; - unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); - } + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::tilize::TestConfig test_config = { + .dst_full_sync_en = dst_full_sync_en, + .input_single_tile_size = 2 * 1024, + .output_single_tile_size = 2 * 1024, + .num_tiles_r = num_tile[0], + .num_tiles_c = num_tile[1], + .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A, + .golden_function = unit_tests::compute::gold_standard_tilize + }; + unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); } } } @@ -334,6 +298,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) { if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); } + for (bool dst_full_sync_en : {true, false}) { unit_tests::compute::tilize::TestConfig test_config = { .dst_full_sync_en = dst_full_sync_en, @@ -349,25 +314,20 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) { } TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS and unpack_tilize hangs on BH -> tt-metal/#13640 - if ((fp32_dest_acc_en == true) && (this->arch_ != tt::ARCH::WORMHOLE_B0)) continue; - for (bool dst_full_sync_en : {true, false}) { + for (bool dst_full_sync_en : {true, false}) { unit_tests::compute::tilize::TestConfig test_config = { .short_init = true, .dst_full_sync_en = dst_full_sync_en, - .fp32_dest_acc_en = fp32_dest_acc_en, .input_single_tile_size = 2 * 1024, - .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2), + .output_single_tile_size = 2 * 1024, .num_tiles_r = num_tile[0], .num_tiles_c = num_tile[1], .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A, .golden_function = unit_tests::compute::gold_standard_tilize }; unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); - } } } } @@ -377,48 +337,38 @@ Following tests are for Unpack Untilize ***************************************/ TEST_F(DeviceFixture, ComputeUnpackUntilize) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - unit_tests::compute::tilize::TestConfig test_config = { - .dst_full_sync_en = dst_full_sync_en, - .fp32_dest_acc_en = fp32_dest_acc_en, - .input_single_tile_size = 2 * 1024, - .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2), - .num_tiles_r = num_tile[0], - .num_tiles_c = num_tile[1], - .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK, - .golden_function = unit_tests::compute::gold_standard_untilize - }; - unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); - } + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::tilize::TestConfig test_config = { + .dst_full_sync_en = dst_full_sync_en, + .input_single_tile_size = 2 * 1024, + .output_single_tile_size = 2 * 1024, + .num_tiles_r = num_tile[0], + .num_tiles_c = num_tile[1], + .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK, + .golden_function = unit_tests::compute::gold_standard_untilize + }; + unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); } } } TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - unit_tests::compute::tilize::TestConfig test_config = { - .short_init = true, - .dst_full_sync_en = dst_full_sync_en, - .fp32_dest_acc_en = fp32_dest_acc_en, - .input_single_tile_size = 2 * 1024, - .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2), - .num_tiles_r = num_tile[0], - .num_tiles_c = num_tile[1], - .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK, - .golden_function = unit_tests::compute::gold_standard_untilize - }; - unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); - } + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::tilize::TestConfig test_config = { + .short_init = true, + .dst_full_sync_en = dst_full_sync_en, + .input_single_tile_size = 2 * 1024, + .output_single_tile_size = 2 * 1024, + .num_tiles_r = num_tile[0], + .num_tiles_c = num_tile[1], + .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK, + .golden_function = unit_tests::compute::gold_standard_untilize + }; + unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); } } } @@ -427,54 +377,44 @@ TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) { Following tests are for pack untilize ***************************************/ TEST_F(DeviceFixture, ComputePackUntilize) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - unit_tests::compute::tilize::TestConfig test_config = { - .dst_full_sync_en = dst_full_sync_en, - .fp32_dest_acc_en = fp32_dest_acc_en, - .input_single_tile_size = 2 * 1024, - .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2), - .num_tiles_r = num_tile[0], - .num_tiles_c = num_tile[1], - .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK, - .golden_function = unit_tests::compute::gold_standard_untilize - }; - unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); - } + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::tilize::TestConfig test_config = { + .dst_full_sync_en = dst_full_sync_en, + .input_single_tile_size = 2 * 1024, + .output_single_tile_size = 2 * 1024, + .num_tiles_r = num_tile[0], + .num_tiles_c = num_tile[1], + .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK, + .golden_function = unit_tests::compute::gold_standard_untilize + }; + unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); } } } TEST_F(DeviceFixture, ComputePackUntilizeShortInit) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { - for (bool fp32_dest_acc_en : {true, false}) { - // FP32 dest acc not possible for GS - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - unit_tests::compute::tilize::TestConfig test_config = { - .short_init = true, - .dst_full_sync_en = dst_full_sync_en, - .fp32_dest_acc_en = fp32_dest_acc_en, - .input_single_tile_size = 2 * 1024, - .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2), - .num_tiles_r = num_tile[0], - .num_tiles_c = num_tile[1], - .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK, - .golden_function = unit_tests::compute::gold_standard_untilize - }; - unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); - } + for (bool dst_full_sync_en : {true, false}) { + unit_tests::compute::tilize::TestConfig test_config = { + .short_init = true, + .dst_full_sync_en = dst_full_sync_en, + .input_single_tile_size = 2 * 1024, + .output_single_tile_size = 2 * 1024, + .num_tiles_r = num_tile[0], + .num_tiles_c = num_tile[1], + .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK, + .golden_function = unit_tests::compute::gold_standard_untilize + }; + unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config); } } } TEST_F(DeviceFixture, ComputePackUntilizeDst) { - vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; + vector > num_tiles = {{1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool dst_full_sync_en : {true, false}) { unit_tests::compute::tilize::TestConfig test_config = { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp index 8a57be5afc7..1ef579e6ed5 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp @@ -19,25 +19,14 @@ using namespace tt; using namespace tt::test_utils; -namespace unit_tests_common::matmul::test_matmul_X_tile{ -struct MatmulTileStimuli { - vector t; // Raw tensor values - vector a; // Activations - vector w; // Weights -}; +namespace unit_tests_common::matmul::test_matmul_X_tile{ struct MatmulTileConfig { uint32_t M, K, N; - // Whether or not to add matmul result with bias: bool with_bias = false; - // Whether or not to use *_init_short LLK API calls: bool test_init_short = false; - // Whether or not to use *_with_dt LLK API init calls: bool with_dt = true; - // Whether or not we want the result to be stored in DST in FP32: - bool fp32_dest_acc_en = false; - // Whether or not to sync full/half DST between MATH and PACK: bool dst_full_sync_en = false; string reader_kernel; string compute_kernel; @@ -45,31 +34,6 @@ struct MatmulTileConfig { MathFidelity math_fidelity = MathFidelity::HiFi4; }; -void create_test_stimuli(MatmulTileStimuli &stimuli, uint32_t M, uint32_t K, uint32_t N) { - SHAPE shape = {1, 1, M * 32, K * 32}; - tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor( - shape, - tt::deprecated::Initialize::RANDOM, - 100, - std::chrono::system_clock::now().time_since_epoch().count() - ); - stimuli.t = tensor.get_values(); - - auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); - auto activations_tile_layout = convert_to_tile_layout(activations_tilized); - auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activations_tile_transposed = transpose_tiles(activations, M, K, 1); - stimuli.a = activations_tile_transposed; - - auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); - auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); - auto weights_tile_layout = convert_to_tile_layout(identity_tilized); - auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); - stimuli.w = weights; - -} - -// This function creates bit masks to model math fidelity phases. This will mask the result only. void set_math_fid_masks(uint16_t &math_fid_mask, MathFidelity math_fidelity = MathFidelity::HiFi4) { auto arch = get_arch_from_string(get_env_arch_name()); switch (math_fidelity) { @@ -81,9 +45,11 @@ void set_math_fid_masks(uint16_t &math_fid_mask, MathFidelity math_fidelity = Ma } } -void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector activations, vector weights, vector tensor_vals){ +bool matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector activations, vector weights, deprecated::Tensor tensor){ + bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); + CoreCoord core = {0, 0}; // num_tile == M == N == K in the case of multi_tile, conveniently they were all the same!! @@ -91,29 +57,25 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT uint32_t M = cfg.M; uint32_t K = cfg.K; uint32_t N = cfg.N; - uint32_t num_tiles = M * K; // only if M = K = N - uint32_t single_tile_size_fp32 = 4 * 32 * 32; // Single 32x32 tile size for Float32 - uint32_t single_tile_size_bfp16b = 2 * 32 * 32; // Single 32x32 tile size for Float16_b / Uint16 - uint32_t single_tile_size_out0 = cfg.fp32_dest_acc_en ? single_tile_size_fp32 : single_tile_size_bfp16b; - const size_t dram_buffer_size_bfp16b = num_tiles * single_tile_size_bfp16b; - const size_t dram_buffer_size_out0 = num_tiles * single_tile_size_out0; - - tt_metal::InterleavedBufferConfig input_dram_config{ - .device=device, - .size = dram_buffer_size_bfp16b, - .page_size = dram_buffer_size_bfp16b, - .buffer_type = tt_metal::BufferType::DRAM - }; - tt_metal::InterleavedBufferConfig output_dram_config{ + uint32_t single_tile_size = 2 * 1024; + uint32_t num_tiles = M * K; // only if M = K = N + uint32_t dram_buffer_size = single_tile_size * num_tiles; + // for multi_tile case buffer size will vary depending on M, N, K + // uint32_t dram_buffer_size_act = single_tile_size * M * K; // num_tiles of FP16_B, hard-coded in the reader/writer kernels + // uint32_t dram_buffer_size_weights = single_tile_size * K * N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels + // uint32_t dram_buffer_size_out = single_tile_size * M * N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels + + + tt_metal::InterleavedBufferConfig dram_config{ .device=device, - .size = dram_buffer_size_out0, - .page_size = dram_buffer_size_out0, + .size = dram_buffer_size, + .page_size = dram_buffer_size, .buffer_type = tt_metal::BufferType::DRAM }; - auto src0_dram_buffer = CreateBuffer(input_dram_config); - auto src1_dram_buffer = CreateBuffer(input_dram_config); - auto dst_dram_buffer = CreateBuffer(output_dram_config); + auto src0_dram_buffer = CreateBuffer(dram_config); + auto src1_dram_buffer = CreateBuffer(dram_config); + auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t num_input_tiles = 2 * M; @@ -122,13 +84,13 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); uint32_t src0_cb_index = 0; - tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{src0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src0_cb_index, single_tile_size_bfp16b); + tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); uint32_t src1_cb_index = 1; - tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{src1_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src1_cb_index, single_tile_size_bfp16b); + tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); std::shared_ptr src2_dram_buffer; @@ -136,24 +98,24 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT if (cfg.with_bias) { // with_bias only when M, N, or K > 1 tt_metal::InterleavedBufferConfig bias_config{ .device=device, - .size = single_tile_size_bfp16b * N, - .page_size = single_tile_size_bfp16b * N, + .size = single_tile_size * N, + .page_size = single_tile_size * N, .buffer_type = tt_metal::BufferType::DRAM }; src2_dram_buffer = CreateBuffer(bias_config); uint32_t src2_cb_index = 2; - tt_metal::CircularBufferConfig cb_src2_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{src2_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src2_cb_index, single_tile_size_bfp16b); + tt_metal::CircularBufferConfig cb_src2_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src2_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src2_cb_index, single_tile_size); auto cb_src2 = tt_metal::CreateCircularBuffer(program, core, cb_src2_config); - } else if (cfg.test_init_short) { // This will be dummy input in uint16_t + } else if (cfg.test_init_short) {// This will be dummy input in uint16_t uint32_t in2_id = 2; uint32_t out1_id = 17; tt_metal::InterleavedBufferConfig dummy_config{ .device=device, - .size = single_tile_size_bfp16b * N, - .page_size = single_tile_size_bfp16b * N, + .size = single_tile_size * N, + .page_size = single_tile_size * N, .buffer_type = tt_metal::BufferType::DRAM }; @@ -164,13 +126,13 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT dst1_dram_buffer = CreateBuffer(dummy_config); tt_metal::CircularBufferConfig cb_src2_config = - tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{in2_id, tt::DataFormat::UInt16}}) - .set_page_size(in2_id, single_tile_size_bfp16b); + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{in2_id, tt::DataFormat::UInt16}}) + .set_page_size(in2_id, single_tile_size); auto cb_src2 = tt_metal::CreateCircularBuffer(program, core, cb_src2_config); tt_metal::CircularBufferConfig cb_dst1_config = - tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{out1_id, tt::DataFormat::UInt16}}) - .set_page_size(out1_id, single_tile_size_bfp16b); + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{out1_id, tt::DataFormat::UInt16}}) + .set_page_size(out1_id, single_tile_size); auto cb_dst1 = tt_metal::CreateCircularBuffer(program, core, cb_dst1_config); } @@ -179,14 +141,14 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT if (cfg.M > 1 || cfg.N > 1 || cfg.K > 1){ uint32_t intermediate_cb_index = 24; std::map partials_and_out_data_format_spec = { - {ouput_cb_index, (cfg.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)}, - {intermediate_cb_index, (cfg.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)} + {ouput_cb_index, tt::DataFormat::Float16_b}, + {intermediate_cb_index, tt::DataFormat::Float16_b} }; CoreRangeSet cores(std::set{CoreRange(core, core)}); - tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(dram_buffer_size_out0, partials_and_out_data_format_spec) - .set_page_size(ouput_cb_index, single_tile_size_out0) - .set_page_size(intermediate_cb_index, single_tile_size_out0); + tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_tiles * single_tile_size, partials_and_out_data_format_spec) + .set_page_size(ouput_cb_index, single_tile_size) + .set_page_size(intermediate_cb_index, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); reader_l1_args = { @@ -199,15 +161,14 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT (std::uint32_t)K, (std::uint32_t)M, (std::uint32_t)N, - (std::uint32_t)(M * single_tile_size_bfp16b), - (std::uint32_t)(N * single_tile_size_bfp16b), + (std::uint32_t)(M * single_tile_size), + (std::uint32_t)(N * single_tile_size), cfg.with_bias }; } else { uint32_t num_output_tiles = 2; - tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size_out0, - {{ouput_cb_index, (cfg.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)}}) - .set_page_size(ouput_cb_index, single_tile_size_out0); + tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(ouput_cb_index, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); reader_l1_args = { @@ -220,17 +181,23 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT 1, 1, 1, - 1 * single_tile_size_bfp16b, - 1 * single_tile_size_bfp16b + 1 * single_tile_size, + 1 * single_tile_size }; } std::map compute_defines; - compute_defines["WITH_DT"] = cfg.with_dt ? "1" : "0"; - compute_defines["TEST_INIT_SHORT"] = cfg.test_init_short ? "1" : "0"; - if (cfg.fp32_dest_acc_en) - compute_defines["DST_ACCUM_MODE"] = "1"; + if (cfg.with_dt) { + compute_defines["WITH_DT"] = "1"; + } else { + compute_defines["WITH_DT"] = "0"; + } + if (cfg.test_init_short) { + compute_defines["TEST_INIT_SHORT"] = "1"; + } else { + compute_defines["TEST_INIT_SHORT"] = "0"; + } auto mm_reader_kernel = tt_metal::CreateKernel( program, @@ -248,12 +215,11 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT program, cfg.compute_kernel, core, - tt_metal::ComputeConfig{ - .math_fidelity = cfg.math_fidelity, - .fp32_dest_acc_en = cfg.fp32_dest_acc_en, - .dst_full_sync_en = cfg.dst_full_sync_en, - .compile_args = cfg.compute_kernel_args, - .defines = compute_defines}); + tt_metal::ComputeConfig{.math_fidelity = cfg.math_fidelity, + .dst_full_sync_en = cfg.dst_full_sync_en, + .compile_args = cfg.compute_kernel_args, + .defines = compute_defines} + ); fixture->WriteBuffer(device, src0_dram_buffer, activations); fixture->WriteBuffer(device, src1_dram_buffer, weights); @@ -268,10 +234,10 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT (std::uint32_t)dram_src2_noc_xy.x, (std::uint32_t)dram_src2_noc_xy.y, (std::uint32_t)N, - (std::uint32_t)(N * single_tile_size_bfp16b) + (std::uint32_t)(N * single_tile_size) }; - for (uint32_t arg : bias_args) { + for (uint32_t arg: bias_args) { reader_l1_args.push_back(arg); } } @@ -293,30 +259,27 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT fixture->RunProgram(device, program); - // This is tilized result, will not be modified - std::vector result_vec; + vector result_vec; fixture->ReadBuffer(device, dst_dram_buffer, result_vec); - std::vector golden = tensor_vals; - std::vector golden_tilized = test_utils::tilize(golden, M*32, N*32); - std::vector golden_tilized_single = convert_to_tile_layout(golden_tilized); + auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec); + auto result_flat_layout = convert_to_flat_layout(result_bfp16); + auto golden = tensor.get_values(); + auto result_untilized = test_utils::untilize(result_flat_layout, M*32, N*32); - std::vector golden_packed(golden_tilized_single.size()); uint16_t math_fid_mask = 0xFFFF; set_math_fid_masks(math_fid_mask, cfg.math_fidelity); - for (auto i = 0; i < golden_tilized.size(); i++) { - golden_tilized_single[i] = bfloat16(golden_tilized_single[i].to_uint16() & math_fid_mask); - if (cfg.fp32_dest_acc_en) { - golden_packed[i] = std::bit_cast(golden_tilized_single[i].to_float()); - } + // If we're testing LoFi/HiFi2 we generate matching golden (trunc LSB). + // Note that this will work only for multiplying with identity matrix + for (auto i = 0; i < golden.size(); i++) { + golden[i] = bfloat16(golden[i].to_uint16() & math_fid_mask); } - if (!cfg.fp32_dest_acc_en) { - golden_packed = pack_bfloat16_vec_into_uint32_vec(golden_tilized_single); + if (cfg.M > 1 || cfg.N > 1 || cfg.K > 1){ + pass &= (golden == result_untilized); + } else { + pass &= (golden == result_flat_layout); // src1 is all 0's } - EXPECT_EQ(golden_packed.size(), result_vec.size()); - EXPECT_EQ(golden_packed, result_vec); - DeallocateBuffer(*src0_dram_buffer); DeallocateBuffer(*src1_dram_buffer); if (cfg.with_bias || cfg.test_init_short) { @@ -326,176 +289,225 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT DeallocateBuffer(*src2_dram_buffer); } DeallocateBuffer(*dst_dram_buffer); - - tt::log_info(tt::LogTest, "Math Fidelity = {}, FP32_DestAcc = {}, DstSyncFull = {}", - cfg.math_fidelity, - cfg.fp32_dest_acc_en, - cfg.dst_full_sync_en - ); + return pass; } } // namespace unit_tests_common::matmul::test_matmul_X_tile -using namespace tt::test_utils; -using namespace unit_tests_common::matmul::test_matmul_X_tile; - -/* matmul_config.compute_kernel_args = { - // block_tile_dim, within block, how many tiles are on the K dim - // dst_tile_rows - // dst_tile_cols - // block_cnt, across blocks, how many tiles are on the K dim - // in0_block_tile_cnt, M * block_tile_dim - // in1_block_tile_cnt, N * block_tile_dim - // out_block_tile_cnt -} -*/ - TEST_F(CommonFixture, MatmulSingleTile){ - for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { - if (i == 1) continue; - for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - MatmulTileConfig matmul_config = { - .M = 1, .K = 1, .N = 1, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", - .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp", - .compute_kernel_args = {1, 1, 1, 1, 1, 1, 1}, - .math_fidelity = MathFidelity(i) - }; - MatmulTileStimuli stimuli; - create_test_stimuli(stimuli, 1, 1, 1); - - for(unsigned int id = 0; id < devices_.size(); id++){ - matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t); - } + for (bool dst_full_sync_en : {true, false}) { + for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { + if (i == 1) continue; + unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = { + .M = 1, .K = 1, .N = 1, + .dst_full_sync_en = dst_full_sync_en, + .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", + .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp", + .compute_kernel_args = { + 1, // block_tile_dim + 1, // dst_tile_rows + 1, // dst_tile_cols + 1, // block_cnt + 1, // in0_block_tile_cnt + 1, // in1_block_tile_cnt + 1 // out_block_tile_cnt + }, + .math_fidelity = MathFidelity(i) + }; + SHAPE shape = {1, 1, 32, 32}; + tt::log_info(tt::LogTest, "Math Fidelity = {}", i); + tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count()); + auto activations_tile_layout = convert_to_tile_layout(tensor.get_values()); + auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); + + auto identity = create_identity_matrix(32, 32, 32); //bfloat16 32x32 identity + auto weights_tile_layout = convert_to_tile_layout(identity); + auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); + + for(unsigned int id = 0; id < devices_.size(); id++){ + ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations, weights, tensor)); } } } } TEST_F(CommonFixture, MatmulMultiTile){ - for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { - if (i == 1) continue; - for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - uint32_t M = fp32_dest_acc_en ? 2 : 4; - uint32_t N = fp32_dest_acc_en ? 2 : 4; - uint32_t K = fp32_dest_acc_en ? 2 : 4; - MatmulTileConfig matmul_config = { - .M = M, .K = K, .N = N, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", - .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp", - .compute_kernel_args = {1, M, N, K, M, N, (M * N), matmul_config.with_bias}, - .math_fidelity = MathFidelity(i) - }; - MatmulTileStimuli stimuli; - create_test_stimuli(stimuli, M, K, N); - - for(unsigned int id = 0; id < devices_.size(); id++){ - matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t); - log_info(LogTest, "Multi tile with no bias passed"); - matmul_config.with_bias = true; - matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t); - log_info(LogTest, "Multi tile with bias passed"); - } + for (bool dst_full_sync_en : {true, false}) { + for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { + if (i == 1) continue; + uint32_t M = 4; + uint32_t N = 4; + uint32_t K = 4; + unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = { + .M = M, .K = K, .N = N, + .dst_full_sync_en = dst_full_sync_en, + .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", + .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp", + .compute_kernel_args = { + 1, // block_tile_dim, within block, how many tiles are on the K dim + M, // dst_tile_rows + N, // dst_tile_cols + K, // block_cnt, across blocks, how many tiles are on the K dim + M, // in0_block_tile_cnt, M * block_tile_dim + N, // in1_block_tile_cnt, N * block_tile_dim + (M * N), // out_block_tile_cnt + matmul_config.with_bias // whether or not to use bias + }, + .math_fidelity = MathFidelity(i) + }; + tt::log_info(tt::LogTest, "Math Fidelity = {}", i); + SHAPE shape = {1, 1, M * 32, K * 32}; + tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count()); + auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); + auto activations_tile_layout = convert_to_tile_layout(activations_tilized); + auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); + auto activations_tile_transposed = transpose_tiles(activations, M, K, 1); + + auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity + auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); + auto weights_tile_layout = convert_to_tile_layout(identity_tilized); + auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); + + for(unsigned int id = 0; id < devices_.size(); id++){ + ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor)); + log_info(LogTest, "Multi tile with no bias passed"); + matmul_config.with_bias = true; + ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor)); + log_info(LogTest, "Multi tile with bias passed"); } } } } TEST_F(CommonFixture, MatmulBlock){ - for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { - if (i == 1) continue; - for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - uint32_t M = fp32_dest_acc_en ? 2 : 4; - uint32_t N = fp32_dest_acc_en ? 2 : 4; - uint32_t K = fp32_dest_acc_en ? 2 : 4; - MatmulTileConfig matmul_config = { - .M = M, .K = K, .N = N, - .test_init_short = false, - .with_dt = false, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", - .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp", - .compute_kernel_args = {1, M, N, K, M, N, (M * N)}, - .math_fidelity = MathFidelity(i) - }; - MatmulTileStimuli stimuli; - create_test_stimuli(stimuli, M, K, N); - - for(unsigned int id = 0; id < devices_.size(); id++){ - matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t); - } + for (bool dst_full_sync_en : {true, false}) { + for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { + if (i == 1) continue; + uint32_t M = 4; + uint32_t N = 4; + uint32_t K = 4; + unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = { + .M = M, .K = K, .N = N, + .test_init_short = false, + .with_dt = false, + .dst_full_sync_en = dst_full_sync_en, + .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", + .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp", + .compute_kernel_args = { + 1, // block_tile_dim, within block, how many tiles are on the K dim + M, // dst_tile_rows + N, // dst_tile_cols + K, // block_cnt, across blocks, how many tiles are on the K dim + M, // in0_block_tile_cnt, M * block_tile_dim + N, // in1_block_tile_cnt, N * block_tile_dim + (M * N), // out_block_tile_cnt + }, + .math_fidelity = MathFidelity(i) + }; + tt::log_info(tt::LogTest, "Math Fidelity = {}", i); + SHAPE shape = {1, 1, M * 32, K * 32}; + tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count()); + auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); + auto activations_tile_layout = convert_to_tile_layout(activations_tilized); + auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); + auto activations_tile_transposed = transpose_tiles(activations, M, K, 1); + + auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity + auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); + auto weights_tile_layout = convert_to_tile_layout(identity_tilized); + auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); + + for(unsigned int id = 0; id < devices_.size(); id++){ + ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor)); } } } } TEST_F(CommonFixture, MatmulBlockInitShort){ - for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { - if (i == 1) continue; - for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - uint32_t M = fp32_dest_acc_en ? 2 : 4; - uint32_t N = fp32_dest_acc_en ? 2 : 4; - uint32_t K = fp32_dest_acc_en ? 2 : 4; - MatmulTileConfig matmul_config = { - .M = M, .K = K, .N = N, - .test_init_short = true, - .with_dt = false, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", - .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp", - .compute_kernel_args = {1, M, N, K, M, N, (M * N)}, - .math_fidelity = MathFidelity(i) - }; - MatmulTileStimuli stimuli; - create_test_stimuli(stimuli, M, K, N); - - for(unsigned int id = 0; id < devices_.size(); id++){ - matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t); - } + for (bool dst_full_sync_en : {true, false}) { + for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { + if (i == 1) continue; + uint32_t M = 4; + uint32_t N = 4; + uint32_t K = 4; + unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = { + .M = M, .K = K, .N = N, + .test_init_short = true, + .with_dt = false, + .dst_full_sync_en = dst_full_sync_en, + .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", + .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp", + .compute_kernel_args = { + 1, // block_tile_dim, within block, how many tiles are on the K dim + M, // dst_tile_rows + N, // dst_tile_cols + K, // block_cnt, across blocks, how many tiles are on the K dim + M, // in0_block_tile_cnt, M * block_tile_dim + N, // in1_block_tile_cnt, N * block_tile_dim + (M * N), // out_block_tile_cnt + }, + .math_fidelity = MathFidelity(i) + }; + tt::log_info(tt::LogTest, "Math Fidelity = {}", i); + SHAPE shape = {1, 1, M * 32, K * 32}; + tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count()); + auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); + auto activations_tile_layout = convert_to_tile_layout(activations_tilized); + auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); + auto activations_tile_transposed = transpose_tiles(activations, M, K, 1); + + auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity + auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); + auto weights_tile_layout = convert_to_tile_layout(identity_tilized); + auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); + + for(unsigned int id = 0; id < devices_.size(); id++){ + ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor)); } } } } TEST_F(CommonFixture, MatmulBlockInitShortWithDt){ - for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { - if (i == 1) continue; - for (bool fp32_dest_acc_en : {true, false}) { - if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; - for (bool dst_full_sync_en : {true, false}) { - uint32_t M = fp32_dest_acc_en ? 2 : 4; - uint32_t N = fp32_dest_acc_en ? 2 : 4; - uint32_t K = fp32_dest_acc_en ? 2 : 4; - MatmulTileConfig matmul_config = { - .M = M, .K = K, .N = N, - .test_init_short = true, - .with_dt = true, - .fp32_dest_acc_en = fp32_dest_acc_en, - .dst_full_sync_en = dst_full_sync_en, - .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", - .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp", - .compute_kernel_args = {1, M, N, K, M, N, (M * N)}, - .math_fidelity = MathFidelity(i) - }; - MatmulTileStimuli stimuli; - create_test_stimuli(stimuli, M, K, N); - - for(unsigned int id = 0; id < devices_.size(); id++){ - matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t); - } + for (bool dst_full_sync_en : {true, false}) { + for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { + if (i == 1) continue; + uint32_t M = 4; + uint32_t N = 4; + uint32_t K = 4; + unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = { + .M = M, .K = K, .N = N, + .test_init_short = true, + .with_dt = true, + .dst_full_sync_en = dst_full_sync_en, + .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp", + .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp", + .compute_kernel_args = { + 1, // block_tile_dim, within block, how many tiles are on the K dim + M, // dst_tile_rows + N, // dst_tile_cols + K, // block_cnt, across blocks, how many tiles are on the K dim + M, // in0_block_tile_cnt, M * block_tile_dim + N, // in1_block_tile_cnt, N * block_tile_dim + (M * N), // out_block_tile_cnt + }, + .math_fidelity = MathFidelity(i) + }; + tt::log_info(tt::LogTest, "Math Fidelity = {}", i); + SHAPE shape = {1, 1, M * 32, K * 32}; + tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count()); + auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); + auto activations_tile_layout = convert_to_tile_layout(activations_tilized); + auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); + auto activations_tile_transposed = transpose_tiles(activations, M, K, 1); + + auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity + auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); + auto weights_tile_layout = convert_to_tile_layout(identity_tilized); + auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); + + for(unsigned int id = 0; id < devices_.size(); id++){ + ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor)); } } }