Skip to content

Commit

Permalink
#16109: fix circular buffer overflow issue on transpose
Browse files Browse the repository at this point in the history
- replace the existing interleaved row-major transpose WH implementation with ttnn::prim::permute
- the current kernel takes up O(H*W) space per core and supports only 16-element-aligned inputs, resulting in CB overflows and conversions to tiled
- the new kernel takes up constant space, making it more reliable, though not as performant at the moment.
  • Loading branch information
sjameelTT committed Dec 18, 2024
1 parent 3ef6837 commit ed94ee6
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -655,18 +655,16 @@ def test_transpose_hc(dtype, shape, device):
)
@pytest.mark.parametrize(
"shape",
[(9216, 128), (1, 32), (1, 12), (1, 35), (16, 32), (34, 8)],
[(9216, 128), (1, 32), (1, 12), (1, 35), (16, 32), (34, 8), [21843, 768]],
)
@pytest.mark.parametrize(
"layout",
[ttnn.TILE_LAYOUT],
[ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
)
def test_transpose_2D(dtype, shape, layout, device):
torch.manual_seed(2005)
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Skipping float32 tests on Grayskull")
if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[-2] % 2):
pytest.skip("Skipping RM odd inner dim test cases")

torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(0, 1)
Expand Down
47 changes: 7 additions & 40 deletions ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "ttnn/decorators.hpp"
#include "device/transpose_op.hpp"
#include "ttnn/operations/data_movement/permute/permute.hpp"
#include "ttnn/operations/data_movement/permute/device/permute_device_operation.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
#include "ttnn/cpp/ttnn/operations/copy.hpp"
#include "ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp"
Expand All @@ -19,38 +20,6 @@ namespace ttnn::operations::data_movement {

namespace detail {

// Estimate the total L1 footprint (bytes) of the circular buffers the
// row-major WH transpose kernel would allocate for this input tensor.
// Fix: the original computed `element_size` and `HtWt` but never used
// them — both unused locals removed.
inline uint32_t get_estimated_size_of_cbs(const Tensor& input_tensor_a) {
    // Tile-grid dimensions of the (padded) input: width and height in tiles.
    const uint32_t Wt = input_tensor_a.get_padded_shape()[-1] / tt::constants::TILE_WIDTH;
    const uint32_t Ht = input_tensor_a.get_padded_shape()[-2] / tt::constants::TILE_HEIGHT;
    const auto data_format = tt::tt_metal::datatype_to_dataformat_converter(input_tensor_a.get_dtype());
    const uint32_t tile_size = tt::tt_metal::detail::TileSize(data_format);

    // CB budget (in tiles): a double-buffered input row (2*Wt), a
    // double-buffered output column (2*Ht), a full-grid intermediate
    // (Ht*Wt), and one extra column-sized intermediate (Ht).
    const uint32_t cb_src0_size = 2 * Wt * tile_size;
    const uint32_t cb_output_size = 2 * Ht * tile_size;
    const uint32_t cb_im_size = Ht * Wt * tile_size;
    const uint32_t cb_im2_size = Ht * tile_size;
    return cb_src0_size + cb_output_size + cb_im_size + cb_im2_size;
}

// Return how many L1 bytes per core are still free for circular buffers:
// the span between the allocator's base address and the lowest L1 address
// already occupied by a buffer (or the full per-core L1 size when nothing
// has been allocated yet).
inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) {
    tt::tt_metal::Device* device = input_tensor_a.device();
    const auto& bank_ids =
        device->bank_ids_from_logical_core(BufferType::L1, *device->compute_cores_.begin());
    const std::optional<uint64_t> lowest_occupied =
        allocator::lowest_occupied_l1_address(*device->allocator_, bank_ids[0]);
    // Usable ceiling: either the first occupied address or the top of L1.
    const uint32_t ceiling =
        lowest_occupied.has_value() ? static_cast<uint32_t>(lowest_occupied.value()) : device->l1_size_per_core();
    return ceiling - device->get_base_allocator_addr(HalMemType::L1);
}

inline bool rm_enough_available_space(const Tensor& input_tensor_a) {
uint32_t max_l1_space = get_max_l1_space(input_tensor_a);
uint32_t estimated_size_of_cbs = get_estimated_size_of_cbs(input_tensor_a);
return max_l1_space > estimated_size_of_cbs;
}

inline Tensor transpose_(
const Tensor& a,
TransposeOpDim transpose_dim,
Expand Down Expand Up @@ -86,16 +55,14 @@ inline Tensor transpose_(
tiled_only = true; // CN only has a tiled implementation at the moment
break;
case TransposeOpDim::WH: // THIS NEEDS TO BE FIXED
if (((W * a.element_size()) % FACE_WIDTH != 0) || ((H * a.element_size()) % FACE_WIDTH != 0)) {
tiled_only = true;
} else if (a.device()->arch() == tt::ARCH::GRAYSKULL) {
if (a.device()->arch() == tt::ARCH::GRAYSKULL) {
tiled_only = a.shape()[-2] > 256; // hangs right now past this dimension, #13660 will turn it from a
// hang into a PCC issue for GS and improve perf for WH
} else if (
!a.is_sharded() && a.layout() == Layout::ROW_MAJOR &&
!rm_enough_available_space(
a)) { // rm is L1 intensive, if it overflows we can do tiled which allocates much smaller CBs
tiled_only = true;
} else if (!a.is_sharded() && a.layout() == Layout::ROW_MAJOR) { // rm is L1 intensive, if it overflows we
// can do tiled which allocates much
// smaller CBs
return ttnn::prim::permute(
a, ttnn::SmallVector<uint32_t>({0, 1, 3, 2}), output_mem_config, std::nullopt);
}
break;
default: break;
Expand Down

0 comments on commit ed94ee6

Please sign in to comment.