#13127: Clean up layout conversion
- Deprecate support for ROW_MAJOR BFLOAT8_B/BFLOAT4_B tensors on host
  * Throw warning when users try to convert layout for BFLOAT8_B/BFLOAT4_B dtypes
  * Throw warning and return the original input tensor (instead of asserting) in the pad op if the input is not in ROW_MAJOR layout
    ** This is a workaround to support old test code that tries to:
       1. Create BFLOAT8_B tensors in ROW_MAJOR
       2. Pad up to tile sizes
       3. Convert layout to TILE
    ** Now, we directly pad and return in TILE layout at step 1
  * Update tensor creation with float vectors to always output TILE layout for BFLOAT8_B/BFLOAT4_B dtypes
    ** Automatically add padding to tile sizes if needed
    ** Throw warning if requested layout is not TILE
    ** Remove duplicate code by calling a helper function for the different pybinds
    ** Remove support for BFLOAT8_B/BFLOAT4_B in create_owned_buffer_from_vector_of_floats
  * Update conv weights utils to do layout conversion in FLOAT32, then unpack and re-pack into BFLOAT8_B/BFLOAT4_B
    ** This is strictly better for host-side perf since we eliminate one unnecessary pack
  * Remove duplicate code in handling of BFLOAT8_B/BFLOAT4_B dtypes in pytensor
- Update convert_layout to use 2D physical shape for all underlying conversions
  * The top-level convert_layout has a pass that flattens higher-rank shapes to 2D for existing use cases
  * Remove padding support for tilize_nchw since this is unused at this level
  * Remove recursive calls inside convert layout to simplify logic
  * Switch convert_layout_row_major_to_tile and convert_layout_tile_to_row_major to use 2D physical shape
  * Update tests/tt_metal/tt_metal/test_bcast.cpp to handle tile padding externally (see the sketch after this list)
- Fix segfault when creating tensors from float vectors with device = None
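
Below is a minimal sketch (not part of this commit) of the calling pattern the test_bcast.cpp change adopts: the caller now pads the row-major reference data up to tile multiples itself and passes the tile-padded shape to convert_layout, which no longer pads internally. The include path, build setup, and the N/C values are illustrative assumptions only.

// Sketch only: assumes tt_metal/common/test_tiles.hpp is reachable on the include path
// and that tt::constants::TILE_HEIGHT / TILE_WIDTH are visible through it.
#include <cstdint>
#include <vector>
#include "tt_metal/common/test_tiles.hpp"

int main() {
    constexpr uint32_t N = 1, C = 2;  // hypothetical batch dims for illustration
    constexpr uint32_t NC = N * C;
    const uint32_t TH = tt::constants::TILE_HEIGHT;
    const uint32_t TW = tt::constants::TILE_WIDTH;

    // Logical data: one value per (n, c). Physical buffer: padded to one full tile each.
    std::vector<uint16_t> ref_values(NC, 0);
    std::vector<uint16_t> ref_values_with_tile_padding(NC * TH * TW, 0);
    for (uint32_t j = 0; j < NC; j++) {
        uint16_t val = static_cast<uint16_t>(10 + j % 7);   // arbitrary test values
        ref_values[j] = val;
        ref_values_with_tile_padding[j * TH * TW] = val;    // element (0, 0) of tile j
    }

    // convert_layout is called with the already-padded shape; padding is the caller's job now.
    std::vector<uint32_t> shape_with_tile_padding = {N, C, TH, TW};
    auto tiled = convert_layout<uint16_t>(
        ref_values_with_tile_padding,
        shape_with_tile_padding,
        tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
        tests::utils::TensorLayoutType::TILED_NFACES);
    return tiled[0] == ref_values[0] ? 0 : 1;               // same check as the test
}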
TT-BrianLiu committed Dec 2, 2024
1 parent 5df8746 commit ab3dc0c
Showing 9 changed files with 265 additions and 315 deletions.
40 changes: 25 additions & 15 deletions tests/tt_metal/tt_metal/test_bcast.cpp
@@ -154,28 +154,30 @@ int main(int argc, char** argv) {

vector<uint16_t> tiled_bcast_values;
vector<uint16_t> ref_bcast_values;
vector<uint32_t> ref_bcast_shape = {N, C, 1, 1};
float bcast_1value = 10.0f;
uint16_t bcast_1value16 = bfloat16(bcast_1value).to_uint16();
unsigned num_bcast_tiles = 0;
// build the constant tiles to be broadcast
if (bcast_dim == BcastDim::HW) {
num_bcast_tiles = NC;
ref_bcast_values.resize(NC, 0);
vector<uint32_t> ref_bcast_shape_with_tile_padding = {N, C, TILE_HEIGHT, TILE_WIDTH};
vector<uint16_t> ref_bcast_values_with_tile_padding;
ref_bcast_values_with_tile_padding.resize(NC * TILE_HEIGHT * TILE_WIDTH, 0);
for (int j = 0; j < NC; j++) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value + (j % 7)).to_uint16();
auto val = bfloat16(bcast_1value + (j % 7)).to_uint16();
ref_bcast_values[j] = val;
ref_bcast_values_with_tile_padding[j * TILE_HEIGHT * TILE_WIDTH] = val;
}
// convert the reference broadcast tensor to tiled format
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values,
ref_bcast_shape,
ref_bcast_values_with_tile_padding,
ref_bcast_shape_with_tile_padding,
tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
tests::utils::TensorLayoutType::TILED_NFACES);
TT_FATAL(tiled_bcast_values[0] == bcast_1value16, "Error");
num_bcast_tiles = NC;
// restore ref values and shape to 1
ref_bcast_shape[3] = 1;
ref_bcast_shape[4] = 1;
} else if (bcast_dim == BcastDim::H) {
// For bcast_h a.k.a. Dim::R we broadcast _over_ H, meaning we take a W vector and += it over each
// element in the H dimension At least that's the behavior i've seen from a single tile bcast-H So
@@ -185,29 +187,37 @@ int main(int argc, char** argv) {
// generate broadcast values along the W axis with one extra tile (needed by the kernel I believe)
// TODO(AP): need to figure out why the extra tile in broadcast inputs is expected by the kernel
ref_bcast_values.resize(NC * W, 0);
ref_bcast_shape[3] = W;
vector<uint32_t> ref_bcast_shape_with_tile_padding = {N, C, TILE_HEIGHT, W};
vector<uint16_t> ref_bcast_values_with_tile_padding;
ref_bcast_values_with_tile_padding.resize(NC * TILE_HEIGHT * W, 0);
for (int j = 0; j < NC * W; j++) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value + (j % 7)).to_uint16();
auto val = bfloat16(bcast_1value + (j % 7)).to_uint16();
ref_bcast_values[j] = val;
ref_bcast_values_with_tile_padding[j % W + (j / W) * TILE_HEIGHT * W] = val;
}
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values,
ref_bcast_shape,
ref_bcast_values_with_tile_padding,
ref_bcast_shape_with_tile_padding,
tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
tests::utils::TensorLayoutType::TILED_NFACES);
num_bcast_tiles = NC * Wt;
// restore values and shape to W
} else if (bcast_dim == BcastDim::W) {
// see the comments above for BCAST_H
ref_bcast_values.resize(NC * H, 0);
ref_bcast_shape[2] = H;
vector<uint32_t> ref_bcast_shape_with_tile_padding = {N, C, H, TILE_WIDTH};
vector<uint16_t> ref_bcast_values_with_tile_padding;
ref_bcast_values_with_tile_padding.resize(NC * H * TILE_WIDTH, 0);
for (int j = 0; j < NC * H; j++) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value + (j % 7)).to_uint16();
auto val = bfloat16(bcast_1value + (j % 7)).to_uint16();
ref_bcast_values[j] = val;
ref_bcast_values_with_tile_padding[j * TILE_WIDTH] = val;
}
tiled_bcast_values = convert_layout<uint16_t>(
ref_bcast_values,
ref_bcast_shape,
ref_bcast_values_with_tile_padding,
ref_bcast_shape_with_tile_padding,
tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
tests::utils::TensorLayoutType::TILED_NFACES);
num_bcast_tiles = NC * Ht;
6 changes: 2 additions & 4 deletions tests/ttnn/unit_tests/tensor/test_tensor_conversion.py
@@ -77,16 +77,14 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device):
tt_tensor = ttnn.Tensor(py_tensor, tt_dtype)
if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}:
assert tt_tensor.storage_type() == ttnn.StorageType.OWNED
tt_tensor = tt_tensor.to(ttnn.TILE_LAYOUT)
assert tt_tensor.layout == ttnn.TILE_LAYOUT
else:
assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED
assert tt_tensor.layout == ttnn.ROW_MAJOR_LAYOUT

tt_tensor = tt_tensor.to(device)
tt_tensor = tt_tensor.cpu()

if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}:
tt_tensor = tt_tensor.to(ttnn.ROW_MAJOR_LAYOUT)

if python_lib == torch:
py_tensor_after_round_trip = tt_tensor.to_torch()
elif python_lib == np:
157 changes: 84 additions & 73 deletions tt_metal/common/test_tiles.hpp
@@ -25,13 +25,15 @@ enum class TensorLayoutType {
};
} // namespace tests::utils

using PhysicalSize = std::array<uint32_t, 2>;

template <class T, template <typename...> typename BufferType>
std::vector<T> convert_to_tile_layout(
const BufferType<T>& data,
std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt,
std::optional<tt::stl::Span<const uint32_t>> face_shape = std::nullopt,
const std::optional<bool>& transpose_within_face = std::nullopt,
const std::optional<bool>& transpose_of_faces = std::nullopt) {
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_face = false,
const bool transpose_face_order = false) {
ZoneScoped;
std::vector<T> result;
if(data.size() == 0) {
@@ -45,8 +47,6 @@ std::vector<T> convert_to_tile_layout(
auto face_W = face_shape.has_value() ? face_shape.value()[1] : tt::constants::FACE_WIDTH;
auto tile_HW = tile_H * tile_W;
auto face_HW = face_H * face_W;
bool transpose_face = transpose_within_face.has_value() ? transpose_within_face.value() : false;
bool transpose_face_order = transpose_of_faces.has_value() ? transpose_of_faces.value() : false;
TT_ASSERT(data.size() % tile_HW == 0);
int num_tiles = data.size() / tile_HW;
for(int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
@@ -116,10 +116,10 @@
template <class T, template <typename...> typename BufferTyp>
std::vector<T> convert_to_flat_layout(
const BufferTyp<T>& data,
std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt,
std::optional<tt::stl::Span<const uint32_t>> face_shape = std::nullopt,
const std::optional<bool>& transpose_within_face = std::nullopt,
const std::optional<bool>& transpose_of_faces = std::nullopt) {
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_face = false,
const bool transpose_face_order = false) {
ZoneScoped;
std::vector<T> result;
if(data.size() == 0) {
Expand All @@ -134,8 +134,6 @@ std::vector<T> convert_to_flat_layout(
auto face_HW = face_H * face_W;
auto num_faces_col = tile_W / face_W;
auto num_faces_row = tile_H / face_H;
bool transpose_face = transpose_within_face.has_value() ? transpose_within_face.value() : false;
bool transpose_face_order = transpose_of_faces.has_value() ? transpose_of_faces.value() : false;
TT_ASSERT(data.size() % tile_HW == 0);
int num_tiles = data.size() / tile_HW;
for(int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
@@ -194,38 +192,35 @@

// Converts a 32-swizzled tilized row-major tensor to a linear 32-zero-padded row-major tensor
template <typename T, template <typename...> typename BufferType>
inline std::vector<T> untilize_nchw(const BufferType<T>& in, tt::stl::Span<const uint32_t> shape, std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt) {
inline std::vector<T> untilize_nchw(
const BufferType<T>& in, const PhysicalSize& shape, std::optional<PhysicalSize> tile_shape = std::nullopt) {
ZoneScoped;
auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT;
auto tile_W = tile_shape.has_value() ? tile_shape.value()[1] : tt::constants::TILE_WIDTH;

std::vector<T> result;
if(in.size() == 0) {
return result;
}

TT_ASSERT(shape[shape.size() - 2] % tile_H == 0 && shape[shape.size() - 1] % tile_W == 0);
auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT;
auto tile_W = tile_shape.has_value() ? tile_shape.value()[1] : tt::constants::TILE_WIDTH;

TT_ASSERT(shape[0] % tile_H == 0 && shape[1] % tile_W == 0);

// Untilize into row major
uint32_t H = shape[shape.size() - 2], W = shape[shape.size() - 1];
uint64_t batch_size = 1;
for (uint32_t i = 0; i < shape.size() - 2; i++) {
batch_size *= shape[i];
}
result.resize(batch_size * H * W);
uint32_t H = shape[0];
uint32_t W = shape[1];

result.resize(H * W);
uint64_t linear = 0;
for (auto batch_index = 0; batch_index < batch_size; batch_index++) {
for (auto hs = 0; hs < H; hs += tile_H) { // iterate over h with stride 32
for (auto ws = 0; ws < W; ws += tile_W) { // iterate over w with stride 32
for (auto ht = 0; ht < tile_H; ht++) { // hs + ht = h
for (auto wt = 0; wt < tile_W; wt++) { // ws + wt = w
T val = in[linear];
auto w = wt + ws;
auto h = ht + hs;
auto offs = w + h * W + batch_index * H * W;
result[offs] = val;
linear++;
}
for (auto hs = 0; hs < H; hs += tile_H) { // iterate over h with stride 32
for (auto ws = 0; ws < W; ws += tile_W) { // iterate over w with stride 32
for (auto ht = 0; ht < tile_H; ht++) { // hs + ht = h
for (auto wt = 0; wt < tile_W; wt++) { // ws + wt = w
T val = in[linear];
auto w = wt + ws;
auto h = ht + hs;
auto offs = w + h * W; // + batch_index * H * W;
result[offs] = val;
linear++;
}
}
}
@@ -240,50 +235,42 @@ inline std::uint32_t round_up_to_mul32(std::uint32_t val) { return ((val & 31) =

inline std::uint32_t round_up_to_tile(int val, int tile_val) { return (val + tile_val - 1) & ~(tile_val - 1); }

// Converts a linear non-zero-padded row-major tensor to zero-padded-32 32-swizzled tilized row-major tensor
// Converts a linear non-zero-padded row-major tensor to 32-swizzled tilized row-major tensor
template <typename T, template <typename...> typename BufferType>
inline std::vector<T> tilize_nchw(const BufferType<T>& in_rowmajor, tt::stl::Span<const uint32_t> shape, std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt) {
inline std::vector<T> tilize_nchw(
const BufferType<T>& in_rowmajor,
const PhysicalSize& shape,
std::optional<PhysicalSize> tile_shape = std::nullopt) {
ZoneScoped;
std::vector<T> tilized_result;
if(in_rowmajor.size() == 0) {
return tilized_result;
}

uint32_t H = shape[shape.size() - 2], W = shape[shape.size() - 1];
uint64_t batch_size = 1;
for (uint32_t i = 0; i < shape.size() - 2; i++) {
batch_size *= shape[i];
}
uint64_t input_volume = batch_size * H * W;
auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT;
auto tile_W = tile_shape.has_value() ? tile_shape.value()[1] : tt::constants::TILE_WIDTH;
uint32_t OH = round_up_to_tile(H, tile_H);
uint32_t OW = round_up_to_tile(W, tile_W);
tilized_result.resize(batch_size * OH * OW);
std::fill(tilized_result.begin(), tilized_result.end(), 0);

TT_ASSERT(shape[0] % tile_H == 0 && shape[1] % tile_W == 0);

uint32_t H = shape[0];
uint32_t W = shape[1];

tilized_result.resize(H * W);
uint64_t out_index = 0;
for (auto batch_index = 0; batch_index < batch_size; batch_index++) {
for (auto hs = 0; hs < H; hs += tile_H) {
for (auto ws = 0; ws < W; ws += tile_W) {
for (auto ht = 0; ht < tile_H; ht++) {
for (auto wt = 0; wt < tile_W; wt++) {
auto w = wt + ws;
auto h = ht + hs;
auto in_offs = w + h * W + batch_index * H * W;
auto val = (w >= W || h >= H || in_offs >= input_volume) ? 0 : in_rowmajor[in_offs];
auto out_w = (out_index % OW);
auto out_h = (out_index / OW) % OH;
TT_ASSERT(w < OW);
TT_ASSERT(h < OH);
auto out_offs = out_w + out_h * OW + batch_index * OH * OW;
tilized_result[out_offs] = val;
out_index++;
}
for (auto hs = 0; hs < H; hs += tile_H) {
for (auto ws = 0; ws < W; ws += tile_W) {
for (auto ht = 0; ht < tile_H; ht++) {
for (auto wt = 0; wt < tile_W; wt++) {
auto w = wt + ws;
auto h = ht + hs;
auto in_offs = w + h * W;
auto val = in_rowmajor[in_offs];
tilized_result[out_index] = val;
out_index++;
}
}
}
}
TT_ASSERT(tilized_result.size() == batch_size * OH * OW);

return tilized_result;
}
@@ -308,13 +295,13 @@ struct TensAddr {
template <typename T, template <typename...> typename BufferType>
inline std::vector<T> convert_layout(
const BufferType<T>& inp,
tt::stl::Span<const uint32_t> shape,
const PhysicalSize& shape,
tests::utils::TensorLayoutType inL,
tests::utils::TensorLayoutType outL,
std::optional<tt::stl::Span<const uint32_t>> tile_shape = std::nullopt,
std::optional<const tt::stl::Span<const uint32_t>> face_shape = std::nullopt,
const std::optional<bool>& transpose_within_face = std::nullopt,
const std::optional<bool>& transpose_of_faces = std::nullopt) {
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_within_face = false,
const bool transpose_of_faces = false) {
ZoneScoped;
if(inp.size() == 0) {
return std::vector<T>();
@@ -333,16 +320,18 @@ inline std::vector<T> convert_layout(
if (outL == tests::utils::TensorLayoutType::TILED_SWIZZLED) {
return tilize_nchw<T>(inp, shape, tile_shape);
} else if (outL == tests::utils::TensorLayoutType::TILED_NFACES) {
auto swiz32 = convert_layout<T>(inp, shape, inL, tests::utils::TensorLayoutType::TILED_SWIZZLED, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
return convert_layout<T>(swiz32, shape, tests::utils::TensorLayoutType::TILED_SWIZZLED, outL, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
auto swiz32 = tilize_nchw<T>(inp, shape, tile_shape);
return convert_to_tile_layout<T>(
swiz32, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
} else
TT_ASSERT(false && "Unsupported conversion.");
break;
case tests::utils::TensorLayoutType::TILED_NFACES:
if (outL == tests::utils::TensorLayoutType::TILED_SWIZZLED) {
return convert_to_flat_layout<T>(inp, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
} else if (outL == tests::utils::TensorLayoutType::LIN_ROW_MAJOR) {
auto swiz32 = convert_layout<T>(inp, shape, inL, tests::utils::TensorLayoutType::TILED_SWIZZLED, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
auto swiz32 =
convert_to_flat_layout<T>(inp, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
return untilize_nchw<T>(swiz32, shape, tile_shape);
} else {
TT_ASSERT(false && "Unsupported conversion");
@@ -353,3 +342,25 @@ inline std::vector<T> convert_layout(
}
return std::vector<T>();
}

template <typename T, template <typename...> typename BufferType>
inline std::vector<T> convert_layout(
const BufferType<T>& inp,
tt::stl::Span<const uint32_t> shape,
tests::utils::TensorLayoutType inL,
tests::utils::TensorLayoutType outL,
std::optional<PhysicalSize> tile_shape = std::nullopt,
std::optional<PhysicalSize> face_shape = std::nullopt,
const bool transpose_within_face = false,
const bool transpose_of_faces = false) {
ZoneScoped;

TT_ASSERT(shape.size() >= 2, "Shape size {} must be at least rank 2!", shape.size());
uint32_t H = shape[shape.size() - 2];
uint32_t W = shape[shape.size() - 1];
for (int i = 0; i < shape.size() - 2; i++) {
H *= shape[i];
}
return convert_layout<T, BufferType>(
inp, PhysicalSize{H, W}, inL, outL, tile_shape, face_shape, transpose_within_face, transpose_of_faces);
}
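
For reference, a minimal usage sketch (not part of the diff) of the rank-N wrapper above, under the same include-path assumptions as the earlier sketch: a {N, C, H, W} call is flattened into the 2D physical shape {N * C * H, W}, so both overloads below produce identical output.

#include <cstdint>
#include <vector>
#include "tt_metal/common/test_tiles.hpp"

int main() {
    const uint32_t N = 2, C = 3;
    const uint32_t H = tt::constants::TILE_HEIGHT;  // tile-aligned: tilize_nchw now asserts
    const uint32_t W = tt::constants::TILE_WIDTH;   // on shapes that are not tile multiples
    std::vector<uint16_t> data(static_cast<size_t>(N) * C * H * W);
    for (size_t i = 0; i < data.size(); i++) {
        data[i] = static_cast<uint16_t>(i % 7);     // arbitrary fill
    }

    // Rank-4 shape goes through the wrapper, which flattens it to 2D.
    std::vector<uint32_t> shape = {N, C, H, W};
    auto tiled_4d = convert_layout<uint16_t>(
        data, shape,
        tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
        tests::utils::TensorLayoutType::TILED_NFACES);

    // Equivalent direct call to the 2D physical-shape overload.
    auto tiled_2d = convert_layout<uint16_t>(
        data, PhysicalSize{N * C * H, W},
        tests::utils::TensorLayoutType::LIN_ROW_MAJOR,
        tests::utils::TensorLayoutType::TILED_NFACES);

    return tiled_4d == tiled_2d ? 0 : 1;            // both paths tile the data identically
}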