#0: Fix the case where the sharding width is not a multiple of 16.

tenstorrent · Apr 24, 2024 · 5a81404 · 5a81404
1 parent a44e9c9
commit 5a81404
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 3 deletions.
diff --git a/tests/ttnn/unit_tests/operations/test_silu_row_major.py b/tests/ttnn/unit_tests/operations/test_silu_row_major.py
@@ -93,6 +93,8 @@ def test_silu_multi_core(device, input_shape, shard_strategy):
         shard_height = math.ceil(batch_size * height * width / ncores)
         shard_width = num_channels
 
+    if (shard_height * shard_width) % 1024 != 0:
+        pytest.skip("Shard sizes that are not multiples of 1024 are not supported.")
     shard_shape = (shard_height, shard_width)
     logger.debug(f"shard_shape={shard_shape}")
     shard_spec = ttnn.experimental.tensor.ShardSpec(shard_grid, shard_shape, shard_orientation, False)

diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp
@@ -35,11 +35,14 @@ operation::ProgramWithCallbacks eltwise_unary_sharded(const Tensor &input, Tenso
     uint32_t output_tile_size = tt::tt_metal::detail::TileSize(out_df);
 
     TT_FATAL(input_tile_size == output_tile_size, "Input and output tile size should be same");
-    uint32_t shard_size_in_bytes = shard_spec.numel() * datum_size(act_df);
+    size_t shard_height = shard_spec.shape[0];
 
-    uint32_t num_tile_per_core = (shard_size_in_bytes + input_tile_size - 1) / input_tile_size; //ceil value
-    TT_FATAL(input_tile_size <= shard_size_in_bytes, "Input tile size should be less than shard size");
+    TT_FATAL((shard_spec.shape[1] * datum_size(act_df)) % L1_ALIGNMENT == 0, "Shard width should be multiple of L1_ADRESS_ALIGNMENT");
+    size_t shard_width = round_up_to_mul16(shard_spec.shape[1]); // rounding up is done to aligned with  --> tt-metal/tt_metal/detail/util.hpp:31
+    size_t shard_size_in_bytes = shard_height * shard_width * datum_size(act_df);
 
+    uint32_t num_tile_per_core = (shard_size_in_bytes + input_tile_size - 1) / input_tile_size; //ceil value
+    TT_FATAL(shard_size_in_bytes % input_tile_size == 0, "Shard Size must be multiple of input_tile_size");
 
     uint32_t in_cb_id = CB::c_in0;
     uint32_t buffering_factor = 1;  // data is already fully buffered in the CBs since its sharded

diff --git a/tt_metal/common/test_tiles.hpp b/tt_metal/common/test_tiles.hpp
@@ -117,6 +117,8 @@ inline std::vector<T> untilize_nchw(const BufferType<T>& in, const std::vector<s
     return result;
 }
 
+inline std::uint32_t round_up_to_mul16(std::uint32_t val) { return ((val & 15) == 0) ? val : (val | 15)+1; }  // Rounds val up to the next multiple of 16; values already a multiple of 16 (including 0) are returned unchanged.
+
 // Rounds val up to the nearest multiple of 32; exact multiples (including 0) are returned unchanged.
 // All arithmetic is unsigned 32-bit, so an input within 31 of the maximum value wraps around to 0.
 inline std::uint32_t round_up_to_mul32(std::uint32_t val) {
     constexpr std::uint32_t low_bits = 31;
     // Bump past the next boundary, then clear the low five bits to land exactly on it.
     return (val + low_bits) & ~low_bits;
 }
 
 // Converts a linear non-zero-padded row-major tensor to zero-padded-32 32-swizzled tilized row-major tensor