#13127: Clean up layout conversion
- Deprecate support for ROW_MAJOR BFLOAT8_B/BFLOAT4_B tensors on host
  * Throw warning when users try to convert layout for BFLOAT8_B/BFLOAT4_B dtypes
  * Throw warning and return the original input tensor (instead of asserting) in the pad op if the input is not in ROW_MAJOR layout
    ** This is a workaround to support old test code that tries to:
       1. Create BFLOAT8_B tensors in ROW_MAJOR
       2. Pad up to tile sizes
       3. Convert layout to TILE
    ** Now, we directly pad and return in TILE layout at step 1
  * Update tensor creation with float vectors to always output TILE layout for BFLOAT8_B/BFLOAT4_B dtypes
    ** Automatically add padding to tile sizes if needed
    ** Throw warning if requested layout is not TILE
    ** Remove duplicate code by calling a helper function for the different pybinds
    ** Remove support for BFLOAT8_B/BFLOAT4_B in create_owned_buffer_from_vector_of_floats
  * Update conv weights utils to do layout conversion in FLOAT32, then unpack and re-pack into BFLOAT8_B/BFLOAT4_B
    ** This is strictly better for host-side perf since we eliminate one unnecessary pack
  * Remove duplicate code in handling of BFLOAT8_B/BFLOAT4_B dtypes in pytensor
- Update convert_layout to use 2D physical shape for all underlying conversions
  * The top-level convert_layout has a pass that flattens higher-rank shapes to 2D for existing use cases
  * Remove padding support for tilize_nchw since this is unused at this level
  * Remove recursive calls inside convert layout to simplify logic
  * Switch convert_layout_row_major_to_tile and convert_layout_tile_to_row_major to use 2D physical shape
  * Update tests/tt_metal/tt_metal/test_bcast.cpp to handle tile padding externally (see the sketch after this list)
- Fix segfault when creating tensors from float vectors with device = None
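
Below is a minimal sketch (not part of this commit) of the calling pattern the test_bcast.cpp change adopts: the caller now pads the row-major reference data up to tile multiples itself and passes the tile-padded shape to convert_layout, which no longer pads internally. The include path, build setup, and the N/C values are illustrative assumptions only.

// Sketch only: assumes tt_metal/common/test_tiles.hpp is reachable on the include path
// and that tt::constants::TILE_HEIGHT / TILE_WIDTH are visible through it.
#include <cstdint>
#include <vector>
#include "tt_metal/common/test_tiles.hpp"

int main() {
    constexpr uint32_t N = 1, C = 2;  // hypothetical batch dims for illustration
    constexpr uint32_t NC = N * C;
    const uint32_t TH = tt::constants::TILE_HEIGHT;
    const uint32_t TW = tt::constants::TILE_WIDTH;

    // Logical data: one value per (n, c). Physical buffer: padded to one full tile each.
    std::vector<uint16_t> ref_values(NC, 0);
    std::vector<uint16_t> ref_values_with_tile_padding(NC * TH * TW, 0);
    for (uint32_t j = 0; j < NC; j++) {
        uint16_t val = static_cast<uint16_t>(10 + j % 7);   // arbitrary test values
        ref_values[j] = val;
        ref_values_with_tile_padding[j * TH * TW] = val;    // element (0, 0) of tile j
    }

    // convert_layout is called with the already-padded shape; padding is the caller's job now.
    std::vector<uint32_t> shape_with_tile_padding = {N, C, TH, TW};
    auto tiled = convert_layout<uint16_t>(
        ref_values_with_tile_padding,
        shape_with_tile_padding,
        tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
        tests::utils::TensorLayoutType::TILED_NFACES);
    return tiled[0] == ref_values[0] ? 0 : 1;               // same check as the test
}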
TT-BrianLiu committed Dec 2, 2024
1 parent 5df8746 commit ab3dc0c
Showing 9 changed files with 265 additions and 315 deletions.
40 changes: 25 additions & 15 deletions tests/tt_metal/tt_metal/test_bcast.cpp
@@ -154,28 +154,30 @@ int main(int argc, char** argv) {

vector<uint16_t> tiled_bcast_values;
vector<uint16_t> ref_bcast_values;
vector<uint32_t> ref_bcast_shape = {N, C, 1, 1};
float bcast_1value = 10.0f;
uint16_t bcast_1value16 = bfloat16(bcast_1value).to_uint16();
unsigned num_bcast_tiles = 0;
// build the constant tiles to be broadcast
if (bcast_dim == BcastDim::HW) {
num_bcast_tiles = NC;
ref_bcast_values.resize(NC, 0);
vector<uint32_t> ref_bcast_shape_with_tile_padding = {N, C, TILE_HEIGHT, TILE_WIDTH};
vector<uint16_t> ref_bcast_values_with_tile_padding;
ref_bcast_values_with_tile_padding.resize(NC * TILE_HEIGHT * TILE_WIDTH, 0);
for (int j = 0; j < NC; j++) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value + (j % 7)).to_uint16();
auto val = bfloat16(bcast_1value + (j % 7)).to_uint16();
ref_bcast_values[j] = val;
ref_bcast_values_with_tile_padding[j * TILE_HEIGHT * TILE_WIDTH] = val;
}
// convert the reference broadcast tensor to tiled format
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values,
ref_bcast_shape,
ref_bcast_values_with_tile_padding,
ref_bcast_shape_with_tile_padding,
tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
tests::utils::TensorLayoutType::TILED_NFACES);
TT_FATAL(tiled_bcast_values[0] == bcast_1value16, "Error");
num_bcast_tiles = NC;
// restore ref values and shape to 1
ref_bcast_shape[3] = 1;
ref_bcast_shape[4] = 1;
} else if (bcast_dim == BcastDim::H) {
// For bcast_h a.k.a. Dim::R we broadcast _over_ H, meaning we take a W vector and += it over each
// element in the H dimension At least that's the behavior i've seen from a single tile bcast-H So
@@ -185,29 +187,37 @@ int main(int argc, char** argv) {
// generate broadcast values along the W axis with one extra tile (needed by the kernel I believe)
// TODO(AP): need to figure out why the extra tile in broadcast inputs is expected by the kernel
ref_bcast_values.resize(NC * W, 0);
ref_bcast_shape[3] = W;
vector<uint32_t> ref_bcast_shape_with_tile_padding = {N, C, TILE_HEIGHT, W};
vector<uint16_t> ref_bcast_values_with_tile_padding;
ref_bcast_values_with_tile_padding.resize(NC * TILE_HEIGHT * W, 0);
for (int j = 0; j < NC * W; j++) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value + (j % 7)).to_uint16();
auto val = bfloat16(bcast_1value + (j % 7)).to_uint16();
ref_bcast_values[j] = val;
ref_bcast_values_with_tile_padding[j % W + (j / W) * TILE_HEIGHT * W] = val;
}
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values,
ref_bcast_shape,
ref_bcast_values_with_tile_padding,
ref_bcast_shape_with_tile_padding,
tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
tests::utils::TensorLayoutType::TILED_NFACES);
num_bcast_tiles = NC * Wt;
// restore values and shape to W
} else if (bcast_dim == BcastDim::W) {
// see the comments above for BCAST_H
ref_bcast_values.resize(NC * H, 0);
ref_bcast_shape[2] = H;
vector<uint32_t> ref_bcast_shape_with_tile_padding = {N, C, H, TILE_WIDTH};
vector<uint16_t> ref_bcast_values_with_tile_padding;
ref_bcast_values_with_tile_padding.resize(NC * H * TILE_WIDTH, 0);
for (int j = 0; j < NC * H; j++) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value + (j % 7)).to_uint16();
auto val = bfloat16(bcast_1value + (j % 7)).to_uint16();
ref_bcast_values[j] = val;
ref_bcast_values_with_tile_padding[j * TILE_WIDTH] = val;
}
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values,
ref_bcast_shape,
ref_bcast_values_with_tile_padding,
ref_bcast_shape_with_tile_padding,
tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
tests::utils::TensorLayoutType::TILED_NFACES);
num_bcast_tiles = NC * Ht;
6 changes: 2 additions & 4 deletions tests/ttnn/unit_tests/tensor/test_tensor_conversion.py
@@ -77,16 +77,14 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device):
tt_tensor = ttnn.Tensor(py_tensor, tt_dtype)
if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}:
assert tt_tensor.storage_type() == ttnn.StorageType.OWNED
tt_tensor = tt_tensor.to(ttnn.TILE_LAYOUT)
assert tt_tensor.layout == ttnn.TILE_LAYOUT
else:
assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED
assert tt_tensor.layout == ttnn.ROW_MAJOR_LAYOUT

tt_tensor = tt_tensor.to(device)
tt_tensor = tt_tensor.cpu()

if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}:
tt_tensor = tt_tensor.to(ttnn.ROW_MAJOR_LAYOUT)

if python_lib == torch:
py_tensor_after_round_trip = tt_tensor.to_torch()
elif python_lib == np:
157 changes: 84 additions & 73 deletions tt_metal/common/test_tiles.hpp
@@ -25,13 +25,15 @@ enum class TensorLayoutType {
};
} // namespace tests::utils

using PhysicalSize = std::array<uint32_t, 2>;

template <class T, template <typename...> typename BufferType>
std::vector<T> convert_to_tile_layout(
const BufferType<T>& data,
std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt,
std::optional<tt::stl::Span<const uint32_t>> face_shape = std::nullopt,
const std::optional<bool>& transpose_within_face = std::nullopt,
const std::optional<bool>& transpose_of_faces = std::nullopt) {
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_face = false,
const bool transpose_face_order = false) {
ZoneScoped;
std::vector<T> result;
if(data.size() == 0) {
@@ -45,8 +47,6 @@ std::vector<T> convert_to_tile_layout(
auto face_W = face_shape.has_value() ? face_shape.value()[1] : tt::constants::FACE_WIDTH;
auto tile_HW = tile_H * tile_W;
auto face_HW = face_H * face_W;
bool transpose_face = transpose_within_face.has_value() ? transpose_within_face.value() : false;
bool transpose_face_order = transpose_of_faces.has_value() ? transpose_of_faces.value() : false;
TT_ASSERT(data.size() % tile_HW == 0);
int num_tiles = data.size() / tile_HW;
for(int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
@@ -116,10 +116,10 @@
template <class T, template <typename...> typename BufferTyp>
std::vector<T> convert_to_flat_layout(
const BufferTyp<T>& data,
std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt,
std::optional<tt::stl::Span<const uint32_t>> face_shape = std::nullopt,
const std::optional<bool>& transpose_within_face = std::nullopt,
const std::optional<bool>& transpose_of_faces = std::nullopt) {
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_face = false,
const bool transpose_face_order = false) {
ZoneScoped;
std::vector<T> result;
if(data.size() == 0) {
Expand All @@ -134,8 +134,6 @@ std::vector<T> convert_to_flat_layout(
auto face_HW = face_H * face_W;
auto num_faces_col = tile_W / face_W;
auto num_faces_row = tile_H / face_H;
bool transpose_face = transpose_within_face.has_value() ? transpose_within_face.value() : false;
bool transpose_face_order = transpose_of_faces.has_value() ? transpose_of_faces.value() : false;
TT_ASSERT(data.size() % tile_HW == 0);
int num_tiles = data.size() / tile_HW;
for(int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
@@ -194,38 +192,35 @@

// Converts a 32-swizzled tilized row-major tensor to a linear 32-zero-padded row-major tensor
template <typename T, template <typename...> typename BufferType>
inline std::vector<T> untilize_nchw(const BufferType<T>& in, tt::stl::Span<const uint32_t> shape, std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt) {
inline std::vector<T> untilize_nchw(
const BufferType<T>& in, const PhysicalSize& shape, std::optional<PhysicalSize> tile_shape = std::nullopt) {
ZoneScoped;
auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT;
auto tile_W = tile_shape.has_value() ? tile_shape.value()[1] : tt::constants::TILE_WIDTH;

std::vector<T> result;
if(in.size() == 0) {
return result;
}

TT_ASSERT(shape[shape.size() - 2] % tile_H == 0 && shape[shape.size() - 1] % tile_W == 0);
auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT;
auto tile_W = tile_shape.has_value() ? tile_shape.value()[1] : tt::constants::TILE_WIDTH;

TT_ASSERT(shape[0] % tile_H == 0 && shape[1] % tile_W == 0);

// Untilize into row major
uint32_t H = shape[shape.size() - 2], W = shape[shape.size() - 1];
uint64_t batch_size = 1;
for (uint32_t i = 0; i < shape.size() - 2; i++) {
batch_size *= shape[i];
}
result.resize(batch_size * H * W);
uint32_t H = shape[0];
uint32_t W = shape[1];

result.resize(H * W);
uint64_t linear = 0;
for (auto batch_index = 0; batch_index < batch_size; batch_index++) {
for (auto hs = 0; hs < H; hs += tile_H) { // iterate over h with stride 32
for (auto ws = 0; ws < W; ws += tile_W) { // iterate over w with stride 32
for (auto ht = 0; ht < tile_H; ht++) { // hs + ht = h
for (auto wt = 0; wt < tile_W; wt++) { // ws + wt = w
T val = in[linear];
auto w = wt + ws;
auto h = ht + hs;
auto offs = w + h * W + batch_index * H * W;
result[offs] = val;
linear++;
}
for (auto hs = 0; hs < H; hs += tile_H) { // iterate over h with stride 32
for (auto ws = 0; ws < W; ws += tile_W) { // iterate over w with stride 32
for (auto ht = 0; ht < tile_H; ht++) { // hs + ht = h
for (auto wt = 0; wt < tile_W; wt++) { // ws + wt = w
T val = in[linear];
auto w = wt + ws;
auto h = ht + hs;
auto offs = w + h * W; // + batch_index * H * W;
result[offs] = val;
linear++;
}
}
}
@@ -240,50 +235,42 @@ inline std::uint32_t round_up_to_mul32(std::uint32_t val) { return ((val & 31) =

inline std::uint32_t round_up_to_tile(int val, int tile_val) { return (val + tile_val - 1) & ~(tile_val - 1); }

// Converts a linear non-zero-padded row-major tensor to zero-padded-32 32-swizzled tilized row-major tensor
// Converts a linear non-zero-padded row-major tensor to 32-swizzled tilized row-major tensor
template <typename T, template <typename...> typename BufferType>
inline std::vector<T> tilize_nchw(const BufferType<T>& in_rowmajor, tt::stl::Span<const uint32_t> shape, std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt) {
inline std::vector<T> tilize_nchw(
const BufferType<T>& in_rowmajor,
const PhysicalSize& shape,
std::optional<PhysicalSize> tile_shape = std::nullopt) {
ZoneScoped;
std::vector<T> tilized_result;
if(in_rowmajor.size() == 0) {
return tilized_result;
}

uint32_t H = shape[shape.size() - 2], W = shape[shape.size() - 1];
uint64_t batch_size = 1;
for (uint32_t i = 0; i < shape.size() - 2; i++) {
batch_size *= shape[i];
}
uint64_t input_volume = batch_size * H * W;
auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT;
auto tile_W = tile_shape.has_value() ? tile_shape.value()[1] : tt::constants::TILE_WIDTH;
uint32_t OH = round_up_to_tile(H, tile_H);
uint32_t OW = round_up_to_tile(W, tile_W);
tilized_result.resize(batch_size * OH * OW);
std::fill(tilized_result.begin(), tilized_result.end(), 0);

TT_ASSERT(shape[0] % tile_H == 0 && shape[1] % tile_W == 0);

uint32_t H = shape[0];
uint32_t W = shape[1];

tilized_result.resize(H * W);
uint64_t out_index = 0;
for (auto batch_index = 0; batch_index < batch_size; batch_index++) {
for (auto hs = 0; hs < H; hs += tile_H) {
for (auto ws = 0; ws < W; ws += tile_W) {
for (auto ht = 0; ht < tile_H; ht++) {
for (auto wt = 0; wt < tile_W; wt++) {
auto w = wt + ws;
auto h = ht + hs;
auto in_offs = w + h * W + batch_index * H * W;
auto val = (w >= W || h >= H || in_offs >= input_volume) ? 0 : in_rowmajor[in_offs];
auto out_w = (out_index % OW);
auto out_h = (out_index / OW) % OH;
TT_ASSERT(w < OW);
TT_ASSERT(h < OH);
auto out_offs = out_w + out_h * OW + batch_index * OH * OW;
tilized_result[out_offs] = val;
out_index++;
}
for (auto hs = 0; hs < H; hs += tile_H) {
for (auto ws = 0; ws < W; ws += tile_W) {
for (auto ht = 0; ht < tile_H; ht++) {
for (auto wt = 0; wt < tile_W; wt++) {
auto w = wt + ws;
auto h = ht + hs;
auto in_offs = w + h * W;
auto val = in_rowmajor[in_offs];
tilized_result[out_index] = val;
out_index++;
}
}
}
}
TT_ASSERT(tilized_result.size() == batch_size * OH * OW);

return tilized_result;
}
@@ -308,13 +295,13 @@ struct TensAddr {
template <typename T, template <typename...> typename BufferType>
inline std::vector<T> convert_layout(
const BufferType<T>& inp,
tt::stl::Span<const uint32_t> shape,
const PhysicalSize& shape,
tests::utils::TensorLayoutType inL,
tests::utils::TensorLayoutType outL,
std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt,
std::optional<const tt::stl::Span<const uint32_t>> face_shape = std::nullopt,
const std::optional<bool>& transpose_within_face = std::nullopt,
const std::optional<bool>& transpose_of_faces = std::nullopt) {
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_within_face = false,
const bool transpose_of_faces = false) {
ZoneScoped;
if(inp.size() == 0) {
return std::vector<T>();
@@ -333,16 +320,18 @@ inline std::vector<T> convert_layout(
if (outL == tests::utils::TensorLayoutType::TILED_SWIZZLED) {
return tilize_nchw<T>(inp, shape, tile_shape);
} else if (outL == tests::utils::TensorLayoutType::TILED_NFACES) {
auto swiz32 = convert_layout<T>(inp, shape, inL, tests::utils::TensorLayoutType::TILED_SWIZZLED, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
return convert_layout<T>(swiz32, shape, tests::utils::TensorLayoutType::TILED_SWIZZLED, outL, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
auto swiz32 = tilize_nchw<T>(inp, shape, tile_shape);
return convert_to_tile_layout<T>(
swiz32, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
} else
TT_ASSERT(false && "Unsupported conversion.");
break;
case tests::utils::TensorLayoutType::TILED_NFACES:
if (outL == tests::utils::TensorLayoutType::TILED_SWIZZLED) {
return convert_to_flat_layout<T>(inp, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
} else if (outL == tests::utils::TensorLayoutType::LIN_ROW_MAJOR) {
auto swiz32 = convert_layout<T>(inp, shape, inL, tests::utils::TensorLayoutType::TILED_SWIZZLED, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
auto swiz32 =
convert_to_flat_layout<T>(inp, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
return untilize_nchw<T>(swiz32, shape, tile_shape);
} else {
TT_ASSERT(false && "Unsupported conversion");
@@ -353,3 +342,25 @@ inline std::vector<T> convert_layout(
}
return std::vector<T>();
}

template <typename T, template <typename...> typename BufferType>
inline std::vector<T> convert_layout(
const BufferType<T>& inp,
tt::stl::Span<const uint32_t> shape,
tests::utils::TensorLayoutType inL,
tests::utils::TensorLayoutType outL,
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_within_face = false,
const bool transpose_of_faces = false) {
ZoneScoped;

TT_ASSERT(shape.size() >= 2, "Shape size {} must be at least rank 2!", shape.size());
uint32_t H = shape[shape.size() - 2];
uint32_t W = shape[shape.size() - 1];
for (int i = 0; i < shape.size() - 2; i++) {
H *= shape[i];
}
return convert_layout<T, BufferType>(
inp, PhysicalSize{H, W}, inL, outL, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
}
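
For reference, a minimal usage sketch (not part of the diff) of the rank-N wrapper above, under the same include-path assumptions as the earlier sketch: a {N, C, H, W} call is flattened into the 2D physical shape {N * C * H, W}, so both overloads below produce identical output.

#include <cstdint>
#include <vector>
#include "tt_metal/common/test_tiles.hpp"

int main() {
    const uint32_t N = 2, C = 3;
    const uint32_t H = tt::constants::TILE_HEIGHT;  // tile-aligned: tilize_nchw now asserts
    const uint32_t W = tt::constants::TILE_WIDTH;   // on shapes that are not tile multiples
    std::vector<uint16_t> data(static_cast<size_t>(N) * C * H * W);
    for (size_t i = 0; i < data.size(); i++) {
        data[i] = static_cast<uint16_t>(i % 7);     // arbitrary fill
    }

    // Rank-4 shape goes through the wrapper, which flattens it to 2D.
    std::vector<uint32_t> shape = {N, C, H, W};
    auto tiled_4d = convert_layout<uint16_t>(
        data, shape,
        tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
        tests::utils::TensorLayoutType::TILED_NFACES);

    // Equivalent direct call to the 2D physical-shape overload.
    auto tiled_2d = convert_layout<uint16_t>(
        data, PhysicalSize{N * C * H, W},
        tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
        tests::utils::TensorLayoutType::TILED_NFACES);

    return tiled_4d == tiled_2d ? 0 : 1;            // both paths tile the data identically
}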