Skip to content

Commit

Permalink
Add tiny tile support for Tensor, matmul (#12908)
Browse files Browse the repository at this point in the history
* #0: add forward backward path for tiny tile on host

* #0: add matmul (reuse config) support for tiny tiles

* #0: add tiny tile support for matmul (dram sharded)

* #0: add tiny tile support for matmul_1d_2d

* #0: cleanup tile enums, add some warnings

* #0: move tile to new file, add hash to desc, debug scan OOM

* #0: minor comment fix

* #0: fix sdpa code size issue

* #0: add const ref to loops
  • Loading branch information
yugaoTT authored Sep 23, 2024
1 parent 310a219 commit 3e52dc0
Show file tree
Hide file tree
Showing 61 changed files with 1,698 additions and 470 deletions.
12 changes: 6 additions & 6 deletions tests/tt_metal/tt_metal/test_bcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ int main(int argc, char **argv) {
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
// convert the reference broadcast tensor to tiled format
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
TT_FATAL(tiled_bcast_values[0] == bcast_1value16, "Error");
// restore ref values and shape to 1
ref_bcast_shape[3] = 1;
Expand All @@ -183,7 +183,7 @@ int main(int argc, char **argv) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
num_bcast_tiles = NC*Wt;
// restore values and shape to W
} else if (bcast_dim == BcastDim::W) {
Expand All @@ -194,7 +194,7 @@ int main(int argc, char **argv) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
num_bcast_tiles = NC*Ht;
}

Expand Down Expand Up @@ -292,7 +292,7 @@ int main(int argc, char **argv) {

tt_metal::detail::LaunchProgram(device, program);

// The kernel will view the input as TILED32_4FACES
// The kernel will view the input as TILED_NFACES
vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(dst_dram_buffer, result_vec);

Expand All @@ -313,15 +313,15 @@ int main(int argc, char **argv) {
// recover a linear view of input vector for consumption by gold_ function
auto u16_src0_vec = u16_from_u32_vector(src0_vec);
vector<uint16_t> src_linear = convert_layout<uint16_t>(
u16_src0_vec, shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
u16_src0_vec, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> gold_added = gold_bcast_op(
src_linear, shape, ref_bcast_values, bcast_dim, bcast_op); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
vector<uint32_t> shapeR{shape[0], shape[1], shape[2], shape[3]};
auto gold_4f_u32 = u32_from_u16_vector(
convert_layout<uint16_t>(
gold_added, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
gold_added, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

pass &= packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_bfp4_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int main(int argc, char **argv) {
}

std::vector<uint32_t> shape_vec = {1, num_tiles, 32, 32};
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

std::vector<uint32_t> packed_bfp4b_tile_vec_rm_in = pack_fp32_vec_as_bfp4_tiles(fp32_vec, /*row_major_input=*/true, /*is_exp_a=*/false);
std::vector<float> unpacked_bfp4b_tile_vec_rm_out = unpack_bfp4_tiles_into_float_vec(packed_bfp4b_tile_vec_rm_in, /*row_major_output*/true, /*is_exp_a=*/false);
Expand All @@ -44,8 +44,8 @@ int main(int argc, char **argv) {
// ////////////////////////////////////////////////////////////////////////////
// // Validation
// ////////////////////////////////////////////////////////////////////////////
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_tile_out, shape_vec, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_tile_out, shape_vec, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

// Ensure that passing row_major_input=true and row_major_output=true is the inverse of passing row_major_input=false and row_major_output=false, i.e. both round trips yield the same result
pass &= (packed_bfp4b_tile_vec_rm_in == packed_bfp4b_tile_vec_tile_in);
Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_bfp8_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int main(int argc, char **argv) {
}

std::vector<uint32_t> shape_vec = {1, 1, 32, 32};
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

std::vector<uint32_t> packed_bfp8b_tile_vec_rm_in = pack_fp32_vec_as_bfp8_tiles(fp32_vec, /*row_major_input=*/true, /*is_exp_a=*/false);
std::vector<float> unpacked_bfp8b_tile_vec_rm_out = unpack_bfp8_tiles_into_float_vec(packed_bfp8b_tile_vec_rm_in, /*row_major_output*/true, /*is_exp_a=*/false);
Expand All @@ -44,8 +44,8 @@ int main(int argc, char **argv) {
// ////////////////////////////////////////////////////////////////////////////
// // Validation
// ////////////////////////////////////////////////////////////////////////////
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_tile_out, shape_vec, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_tile_out, shape_vec, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

// Ensure that passing row_major_input=true and row_major_output=true is the inverse of passing row_major_input=false and row_major_output=false, i.e. both round trips yield the same result
pass &= (packed_bfp8b_tile_vec_rm_in == packed_bfp8b_tile_vec_tile_in);
Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_bmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,13 @@ int main(int argc, char **argv) {
vector<uint32_t> shapeC = {1, B, Mt*32, Nt*32};
auto u16_src0_vec = u16_from_u32_vector(src0_vec);
auto u16_src1_vec = u16_from_u32_vector(src1_vec);
vector<uint16_t> src0_linear = convert_layout<uint16_t>(u16_src0_vec, shapeA, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src1_linear = convert_layout<uint16_t>(u16_src1_vec, shapeB, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src0_linear = convert_layout<uint16_t>(u16_src0_vec, shapeA, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src1_linear = convert_layout<uint16_t>(u16_src1_vec, shapeB, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> ref_bmm = gold_bmm(shapeA, src0_linear, shapeB, src1_linear);

// Tilize gold from row major and convert to pairs (uint32_t)
auto gold_4f_u32 = u32_from_u16_vector( convert_layout<uint16_t>(
ref_bmm, shapeC, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
ref_bmm, shapeC, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

pass &= packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
Expand Down
4 changes: 2 additions & 2 deletions tests/tt_metal/tt_metal/test_transpose_hc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,12 @@ int main(int argc, char **argv) {
};

// recover a linear view of input vector for consumption by gold_ function
vector<uint16_t> src_linear = convert_layout<uint16_t>(src_4f_16, shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src_linear = convert_layout<uint16_t>(src_4f_16, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> gold_reduced = gold_transpose_hc(src_linear, shape); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
vector<uint32_t> shapeR{shape[0], shape[2], shape[1], shape[3]};
auto gold_16_4f = convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
auto gold_16_4f = convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
auto gold_4f_u32 = u32_from_u16_vector(gold_16_4f);
auto u16_result = u16_from_u32_vector(result_vec);

Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig

tt_metal::detail::LaunchProgram(device, program);

// The kernel will view the input as TILED32_4FACES
// The kernel will view the input as TILED_NFACES
std::vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(dst_dram_buffer, result_vec);

Expand All @@ -353,11 +353,11 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig
}
}
// recover a linear view of input vector for consumption by gold_ function
std::vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, test_config.shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, test_config.shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<uint16_t> gold_reduced = test_config.golden_function(src_linear, test_config.shape, scaler, uint8_t(test_config.reduce_type), true); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, test_config.result_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, test_config.result_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

bool pass = packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
Expand Down
4 changes: 2 additions & 2 deletions tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ void validate_transpose_wh(const std::vector<uint32_t> &src_vec, const std::vect

// recover a linear view of input vector for consumption by gold_ function
auto u16_src0_vec = u16_from_u32_vector(src_vec);
vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> gold_reduced = gold_transpose_wh(src_linear, shape); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
TT_FATAL(shape.size() == 4, "Error");
vector<uint32_t> shapeR{shape[0], shape[1], shape[3], shape[2]};
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

bool pass = packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (not pass) {
Expand Down
Loading

0 comments on commit 3e52dc0

Please sign in to comment.