Skip to content

Commit

Permalink
Add tiny tile support for Tensor, matmul (#12908)
Browse files Browse the repository at this point in the history
* #0: add forward backward path for tiny tile on host

* #0: add matmul (reuse config) support for tiny tiles

* #0: add tiny tile support for matmul (dram sharded)

* #0: add tiny tile support for matmul_1d_2d

* #0: cleanup tile enums, add some warnings

* #0: move tile to new file, add hash to desc, debug scan OOM

* #0: minor comment fix

* #0: fix sdpa code size issue

* #0: add const ref to loops
  • Loading branch information
yugaoTT authored Sep 23, 2024
1 parent 310a219 commit 3e52dc0
Show file tree
Hide file tree
Showing 61 changed files with 1,698 additions and 470 deletions.
12 changes: 6 additions & 6 deletions tests/tt_metal/tt_metal/test_bcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ int main(int argc, char **argv) {
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
// convert the reference broadcast tensor to tiled format
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
TT_FATAL(tiled_bcast_values[0] == bcast_1value16, "Error");
// restore ref values and shape to 1
ref_bcast_shape[3] = 1;
Expand All @@ -183,7 +183,7 @@ int main(int argc, char **argv) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
num_bcast_tiles = NC*Wt;
// restore values and shape to W
} else if (bcast_dim == BcastDim::W) {
Expand All @@ -194,7 +194,7 @@ int main(int argc, char **argv) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
num_bcast_tiles = NC*Ht;
}

Expand Down Expand Up @@ -292,7 +292,7 @@ int main(int argc, char **argv) {

tt_metal::detail::LaunchProgram(device, program);

// The kernel will view the input as TILED32_4FACES
// The kernel will view the input as TILED_NFACES
vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(dst_dram_buffer, result_vec);

Expand All @@ -313,15 +313,15 @@ int main(int argc, char **argv) {
// recover a linear view of input vector for consumption by gold_ function
auto u16_src0_vec = u16_from_u32_vector(src0_vec);
vector<uint16_t> src_linear = convert_layout<uint16_t>(
u16_src0_vec, shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
u16_src0_vec, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> gold_added = gold_bcast_op(
src_linear, shape, ref_bcast_values, bcast_dim, bcast_op); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
vector<uint32_t> shapeR{shape[0], shape[1], shape[2], shape[3]};
auto gold_4f_u32 = u32_from_u16_vector(
convert_layout<uint16_t>(
gold_added, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
gold_added, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

pass &= packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_bfp4_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int main(int argc, char **argv) {
}

std::vector<uint32_t> shape_vec = {1, num_tiles, 32, 32};
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

std::vector<uint32_t> packed_bfp4b_tile_vec_rm_in = pack_fp32_vec_as_bfp4_tiles(fp32_vec, /*row_major_input=*/true, /*is_exp_a=*/false);
std::vector<float> unpacked_bfp4b_tile_vec_rm_out = unpack_bfp4_tiles_into_float_vec(packed_bfp4b_tile_vec_rm_in, /*row_major_output*/true, /*is_exp_a=*/false);
Expand All @@ -44,8 +44,8 @@ int main(int argc, char **argv) {
// ////////////////////////////////////////////////////////////////////////////
// // Validation
// ////////////////////////////////////////////////////////////////////////////
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_tile_out, shape_vec, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_tile_out, shape_vec, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

// Ensure that passing row_major_input=true and row_major_output=true is the inverse of passing row_major_input=false and row_major_output=false, i.e. both round trips yield the same result
pass &= (packed_bfp4b_tile_vec_rm_in == packed_bfp4b_tile_vec_tile_in);
Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_bfp8_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int main(int argc, char **argv) {
}

std::vector<uint32_t> shape_vec = {1, 1, 32, 32};
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

std::vector<uint32_t> packed_bfp8b_tile_vec_rm_in = pack_fp32_vec_as_bfp8_tiles(fp32_vec, /*row_major_input=*/true, /*is_exp_a=*/false);
std::vector<float> unpacked_bfp8b_tile_vec_rm_out = unpack_bfp8_tiles_into_float_vec(packed_bfp8b_tile_vec_rm_in, /*row_major_output*/true, /*is_exp_a=*/false);
Expand All @@ -44,8 +44,8 @@ int main(int argc, char **argv) {
// ////////////////////////////////////////////////////////////////////////////
// // Validation
// ////////////////////////////////////////////////////////////////////////////
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_tile_out, shape_vec, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_tile_out, shape_vec, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);

// Ensure that passing row_major_input=true and row_major_output=true is the inverse of passing row_major_input=false and row_major_output=false, i.e. both round trips yield the same result
pass &= (packed_bfp8b_tile_vec_rm_in == packed_bfp8b_tile_vec_tile_in);
Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_bmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,13 @@ int main(int argc, char **argv) {
vector<uint32_t> shapeC = {1, B, Mt*32, Nt*32};
auto u16_src0_vec = u16_from_u32_vector(src0_vec);
auto u16_src1_vec = u16_from_u32_vector(src1_vec);
vector<uint16_t> src0_linear = convert_layout<uint16_t>(u16_src0_vec, shapeA, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src1_linear = convert_layout<uint16_t>(u16_src1_vec, shapeB, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src0_linear = convert_layout<uint16_t>(u16_src0_vec, shapeA, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src1_linear = convert_layout<uint16_t>(u16_src1_vec, shapeB, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> ref_bmm = gold_bmm(shapeA, src0_linear, shapeB, src1_linear);

// Tilize gold from row major and convert to pairs (uint32_t)
auto gold_4f_u32 = u32_from_u16_vector( convert_layout<uint16_t>(
ref_bmm, shapeC, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
ref_bmm, shapeC, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

pass &= packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
Expand Down
4 changes: 2 additions & 2 deletions tests/tt_metal/tt_metal/test_transpose_hc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,12 @@ int main(int argc, char **argv) {
};

// recover a linear view of input vector for consumption by gold_ function
vector<uint16_t> src_linear = convert_layout<uint16_t>(src_4f_16, shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src_linear = convert_layout<uint16_t>(src_4f_16, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> gold_reduced = gold_transpose_hc(src_linear, shape); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
vector<uint32_t> shapeR{shape[0], shape[2], shape[1], shape[3]};
auto gold_16_4f = convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES);
auto gold_16_4f = convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
auto gold_4f_u32 = u32_from_u16_vector(gold_16_4f);
auto u16_result = u16_from_u32_vector(result_vec);

Expand Down
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig

tt_metal::detail::LaunchProgram(device, program);

// The kernel will view the input as TILED32_4FACES
// The kernel will view the input as TILED_NFACES
std::vector<uint32_t> result_vec;
tt_metal::detail::ReadFromBuffer(dst_dram_buffer, result_vec);

Expand All @@ -353,11 +353,11 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig
}
}
// recover a linear view of input vector for consumption by gold_ function
std::vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, test_config.shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, test_config.shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
std::vector<uint16_t> gold_reduced = test_config.golden_function(src_linear, test_config.shape, scaler, uint8_t(test_config.reduce_type), true); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, test_config.result_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, test_config.result_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

bool pass = packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
Expand Down
4 changes: 2 additions & 2 deletions tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ void validate_transpose_wh(const std::vector<uint32_t> &src_vec, const std::vect

// recover a linear view of input vector for consumption by gold_ function
auto u16_src0_vec = u16_from_u32_vector(src_vec);
vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, shape, TensorLayout::TILED32_4FACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
vector<uint16_t> gold_reduced = gold_transpose_wh(src_linear, shape); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
TT_FATAL(shape.size() == 4, "Error");
vector<uint32_t> shapeR{shape[0], shape[1], shape[3], shape[2]};
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED32_4FACES));
auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));

bool pass = packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (not pass) {
Expand Down
Loading

0 comments on commit 3e52dc0

Please sign in to comment.