Nkpatel/conv op non tile multiple shard width #15742

Merged: 14 commits, Dec 5, 2024
1 change: 1 addition & 0 deletions tests/scripts/run_tt_eager.py
@@ -36,6 +36,7 @@
TestEntry("tt_eager/tests/ops/test_bcast_op", "ops/test_bcast_op"),
TestEntry("tt_eager/tests/ops/test_transpose_op", "ops/test_transpose_op"),
TestEntry("tt_eager/tests/ops/test_sliding_window_ops", "ops/test_sliding_window_ops"),
TestEntry("tt_eager/tests/ops/test_tensor_utils", "ops/test_tensor_utils"),
TestEntry("tt_eager/tests/ops/test_bmm_op", "ops/test_bmm_op"),
void_for_bh(void_for_whb0(TestEntry("tt_eager/tests/ops/test_eltwise_unary_op", "ops/test_eltwise_unary_op"))),
void_for_whb0(
1 change: 1 addition & 0 deletions tests/tt_eager/CMakeLists.txt
@@ -24,6 +24,7 @@ set(TT_EAGER_TESTS_OPS
ops/test_sfpu.cpp
ops/test_sliding_window_ops.cpp
ops/test_fold_op.cpp
ops/test_tensor_utils.cpp
)

set(TT_EAGER_TESTS_TENSORS
483 changes: 483 additions & 0 deletions tests/tt_eager/ops/test_tensor_utils.cpp

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -2620,6 +2620,92 @@ def test_non_tile_multiple_height_conv_wh(
)


@skip_for_grayskull()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
    "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
    (
        (1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
    ),
)
@pytest.mark.parametrize(
    "weights_dtype",
    [ttnn.bfloat16, ttnn.bfloat8_b],
)
@pytest.mark.parametrize(
    "activations_dtype",
    [ttnn.bfloat16],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("enable_auto_formatting", [False])
def test_non_tile_multiple_width_conv_wh(
    device,
    use_program_cache,
    math_fidelity,
    activations_dtype,
    weights_dtype,
    batch_size,
    output_channels,
    input_channels,
    input_height,
    input_width,
    filter_height,
    filter_width,
    stride_h,
    stride_w,
    pad_h,
    pad_w,
    use_1d_systolic_array,
    config_override,
    enable_auto_formatting,
):
    run_conv(
        device,
        math_fidelity,
        activations_dtype,
        weights_dtype,
        batch_size,
        output_channels,
        input_channels,
        input_height,
        input_width,
        filter_height,
        filter_width,
        stride_h,
        stride_w,
        pad_h,
        pad_w,
        use_1d_systolic_array,
        config_override,
        use_shallow_conv_variant=(input_channels == 16),
        transpose_mcast=use_1d_systolic_array,
        enable_auto_formatting=enable_auto_formatting,
        padded_input_channels=16 if input_channels == 16 else None,
        output_layout=ttnn.ROW_MAJOR_LAYOUT,
    )
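    # The input_channels sweep above targets block-sharded shard widths that are
    # not multiples of the 32-wide tile: assuming, for illustration, an 8-core
    # split along channels, 320 input channels give 40 channels per core (the
    # actual split depends on the parallel config chosen at runtime).
    # use_1d_systolic_array=False (block sharding) and the ROW_MAJOR output
    # layout match the is_non_tile_mul_width condition added in conv2d.cpp.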


@skip_for_grayskull()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
def test_shallow_conv_with_tiled_input(device):
21 changes: 19 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -80,8 +80,17 @@ Result conv2d(
ttnn::is_tensor_on_device_or_multidevice(input_tensor) ? std::make_optional(input_tensor.memory_config()) : std::nullopt);
}

ShardOrientation shard_orientation =
conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
auto num_cores_c = shard_orientation == ShardOrientation::COL_MAJOR ? device->compute_with_storage_grid_size().y : device->compute_with_storage_grid_size().x;
auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2;
bool is_non_tile_mul_width =
(conv_config.shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && conv_config.act_block_h_override == 0 &&
(conv_config.weights_dtype == DataType::BFLOAT8_B || conv_config.weights_dtype == DataType::BFLOAT16) &&
conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size * in_channels) % (16 * num_cores_c)) == 0;
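// Illustrative example (assuming an 8-wide storage grid, i.e. num_cores_c == 8):
// for in_channels = 320 with BFLOAT16 weights, elem_size = 2 and
// (2 * 320) % (16 * 8) == 640 % 128 == 0, so each core's channel shard is
// 16-byte aligned even though 320 / 8 = 40 channels per core is not a multiple
// of TILE_WIDTH (32); that is the shape the non-tile-multiple-width path handles.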

auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required(
device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv);
device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv, is_non_tile_mul_width);
if (tensor_manipulated) {
if (conv_config.deallocate_activation) {
ttnn::Tensor input_tensor_ = input_tensor; // TODO: allow in place modification of inputs to the op
@@ -96,6 +105,9 @@
uint32_t out_channels_padded = tt::round_up(
out_channels,
get_num_cores_channels_from_parallel_config(output_parallel_config) * tt::constants::TILE_WIDTH);
if(is_non_tile_mul_width) {
out_channels_padded = tt::round_up(out_channels, 32);
}
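// Illustrative example (assuming 8 output-channel cores): out_channels = 64 would
// be padded to round_up(64, 8 * 32) = 256 by the default path above, but only to
// round_up(64, 32) = 64 on the non-tile-multiple-width path.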
MemoryConfig conv_out_memory_config = create_sharded_memory_config_from_parallel_config(
ttnn::Shape(std::array<uint32_t, 4>{1, 1, nhw_out, out_channels_padded}),
output_parallel_config,
@@ -110,6 +122,9 @@
uint32_t in_channels_padded = tt::round_up(
in_channels,
get_num_cores_channels_from_parallel_config(parallel_config) * conv_config.input_channels_alignment);
if(is_non_tile_mul_width){
in_channels_padded = tt::round_up(in_channels, conv_config.input_channels_alignment);
}
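// Illustrative example (assuming 8 channel cores and the usual
// input_channels_alignment of 32): in_channels = 320 pads to
// round_up(320, 8 * 32) = 512 by default, but stays at round_up(320, 32) = 320 here.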

uint32_t nhw_out_padded_ntile = get_num_cores_nhw_from_parallel_config(output_parallel_config) *
conv_out_memory_config.shard_spec.value().shape[0] / tt::constants::TILE_HEIGHT;
@@ -141,7 +156,9 @@
device,
groups,
opt_conv_op_block_config.act_block_h_ntiles,
input_width);
input_width,
true,
is_non_tile_mul_width);
}
// if 1x1 conv w/ stride 1, convert input tensor to tile layout if required
if (mm_conv) {
12 changes: 6 additions & 6 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -321,16 +321,16 @@ void py_bind_conv2d(py::module& module) {
py::arg("grid_size"),
py::arg("num_cores_nhw") = 1,
py::arg("num_cores_c") = 1,
py::arg("per_core_out_matrix_height_ntiles").noconvert() = 1,
py::arg("per_core_out_matrix_width_ntiles").noconvert() = 1)
py::arg("per_core_out_matrix_height").noconvert(),
py::arg("per_core_out_matrix_width").noconvert())
.def_property_readonly("grid_size", [](OptimizedConvParallelizationConfig const& c) { return c.grid_size; })
.def_property_readonly(
"num_cores_nhw", [](OptimizedConvParallelizationConfig const& c) { return c.num_cores_nhw; })
.def_property_readonly(
"per_core_out_matrix_height_ntiles",
[](OptimizedConvParallelizationConfig const& c) { return c.per_core_out_matrix_height_ntiles; })
.def_property_readonly("per_core_out_matrix_width_ntiles", [](OptimizedConvParallelizationConfig const& c) {
return c.per_core_out_matrix_width_ntiles;
"per_core_out_matrix_height",
[](OptimizedConvParallelizationConfig const& c) { return c.per_core_out_matrix_height; })
.def_property_readonly("per_core_out_matrix_width", [](OptimizedConvParallelizationConfig const& c) {
return c.per_core_out_matrix_width;
});

py::class_<OptimizedConvBlockConfig>(module, "OptimizedConvBlockConfig")
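The binding change above drops the _ntiles suffix from the per-core output matrix
arguments and properties of OptimizedConvParallelizationConfig. A minimal usage
sketch of the renamed keyword arguments follows; the module path, the CoreCoord
grid argument, and the numeric values are assumptions for illustration, not taken
from this PR.

# Hypothetical usage of the renamed arguments (import path is an assumption):
parallel_config = ttnn.OptimizedConvParallelizationConfig(
    grid_size=ttnn.CoreCoord(8, 8),
    num_cores_nhw=8,
    num_cores_c=8,
    per_core_out_matrix_height=128,  # plain dimension now, no *_ntiles suffix
    per_core_out_matrix_width=40,    # width no longer needs to be a tile multiple
)
print(parallel_config.per_core_out_matrix_height, parallel_config.per_core_out_matrix_width)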