Nkpatel/conv op non tile multiple shard width #15742

Merged: 14 commits, Dec 5, 2024
1 change: 1 addition & 0 deletions tests/scripts/run_tt_eager.py
@@ -36,6 +36,7 @@
TestEntry("tt_eager/tests/ops/test_bcast_op", "ops/test_bcast_op"),
TestEntry("tt_eager/tests/ops/test_transpose_op", "ops/test_transpose_op"),
TestEntry("tt_eager/tests/ops/test_sliding_window_ops", "ops/test_sliding_window_ops"),
TestEntry("tt_eager/tests/ops/test_tensor_utils", "ops/test_tensor_utils"),
TestEntry("tt_eager/tests/ops/test_bmm_op", "ops/test_bmm_op"),
void_for_bh(void_for_whb0(TestEntry("tt_eager/tests/ops/test_eltwise_unary_op", "ops/test_eltwise_unary_op"))),
void_for_whb0(
1 change: 1 addition & 0 deletions tests/tt_eager/CMakeLists.txt
@@ -24,6 +24,7 @@ set(TT_EAGER_TESTS_OPS
ops/test_sfpu.cpp
ops/test_sliding_window_ops.cpp
ops/test_fold_op.cpp
ops/test_tensor_utils.cpp
)

set(TT_EAGER_TESTS_TENSORS
483 changes: 483 additions & 0 deletions tests/tt_eager/ops/test_tensor_utils.cpp

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -2620,6 +2620,92 @@ def test_non_tile_multiple_height_conv_wh(
)


@skip_for_grayskull()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
    "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
    (
        (1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
        (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
    ),
)
@pytest.mark.parametrize(
    "weights_dtype",
    [ttnn.bfloat16, ttnn.bfloat8_b],
)
@pytest.mark.parametrize(
    "activations_dtype",
    [ttnn.bfloat16],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("enable_auto_formatting", [False])
def test_non_tile_multiple_width_conv_wh(
    device,
    use_program_cache,
    math_fidelity,
    activations_dtype,
    weights_dtype,
    batch_size,
    output_channels,
    input_channels,
    input_height,
    input_width,
    filter_height,
    filter_width,
    stride_h,
    stride_w,
    pad_h,
    pad_w,
    use_1d_systolic_array,
    config_override,
    enable_auto_formatting,
):
    run_conv(
        device,
        math_fidelity,
        activations_dtype,
        weights_dtype,
        batch_size,
        output_channels,
        input_channels,
        input_height,
        input_width,
        filter_height,
        filter_width,
        stride_h,
        stride_w,
        pad_h,
        pad_w,
        use_1d_systolic_array,
        config_override,
        use_shallow_conv_variant=(input_channels == 16),
        transpose_mcast=use_1d_systolic_array,
        enable_auto_formatting=enable_auto_formatting,
        padded_input_channels=16 if input_channels == 16 else None,
        output_layout=ttnn.ROW_MAJOR_LAYOUT,
    )
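    # The input_channels sweep above targets block-sharded shard widths that are
    # not multiples of the 32-wide tile: assuming, for illustration, an 8-core
    # split along channels, 320 input channels give 40 channels per core (the
    # actual split depends on the parallel config chosen at runtime).
    # use_1d_systolic_array=False (block sharding) and the ROW_MAJOR output
    # layout match the is_non_tile_mul_width condition added in conv2d.cpp.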


@skip_for_grayskull()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
def test_shallow_conv_with_tiled_input(device):
21 changes: 19 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -80,8 +80,17 @@ Result conv2d(
ttnn::is_tensor_on_device_or_multidevice(input_tensor) ? std::make_optional(input_tensor.memory_config()) : std::nullopt);
}

ShardOrientation shard_orientation =
conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
auto num_cores_c = shard_orientation == ShardOrientation::COL_MAJOR ? device->compute_with_storage_grid_size().y : device->compute_with_storage_grid_size().x;
auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2;
bool is_non_tile_mul_width =
(conv_config.shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && conv_config.act_block_h_override == 0 &&
(conv_config.weights_dtype == DataType::BFLOAT8_B || conv_config.weights_dtype == DataType::BFLOAT16) &&
conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size * in_channels) % (16 * num_cores_c)) == 0;
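// Illustrative example (assuming an 8-wide storage grid, i.e. num_cores_c == 8):
// for in_channels = 320 with BFLOAT16 weights, elem_size = 2 and
// (2 * 320) % (16 * 8) == 640 % 128 == 0, so each core's channel shard is
// 16-byte aligned even though 320 / 8 = 40 channels per core is not a multiple
// of TILE_WIDTH (32); that is the shape the non-tile-multiple-width path handles.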

auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required(
device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv);
device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv, is_non_tile_mul_width);
if (tensor_manipulated) {
if (conv_config.deallocate_activation) {
ttnn::Tensor input_tensor_ = input_tensor; // TODO: allow in place modification of inputs to the op
@@ -96,6 +105,9 @@
uint32_t out_channels_padded = tt::round_up(
out_channels,
get_num_cores_channels_from_parallel_config(output_parallel_config) * tt::constants::TILE_WIDTH);
if(is_non_tile_mul_width) {
out_channels_padded = tt::round_up(out_channels, 32);
}
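// Illustrative example (assuming 8 output-channel cores): out_channels = 64 would
// be padded to round_up(64, 8 * 32) = 256 by the default path above, but only to
// round_up(64, 32) = 64 on the non-tile-multiple-width path.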
MemoryConfig conv_out_memory_config = create_sharded_memory_config_from_parallel_config(
ttnn::Shape(std::array<uint32_t, 4>{1, 1, nhw_out, out_channels_padded}),
output_parallel_config,
@@ -110,6 +122,9 @@
uint32_t in_channels_padded = tt::round_up(
in_channels,
get_num_cores_channels_from_parallel_config(parallel_config) * conv_config.input_channels_alignment);
if(is_non_tile_mul_width){
in_channels_padded = tt::round_up(in_channels, conv_config.input_channels_alignment);
}
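// Illustrative example (assuming 8 channel cores and the usual
// input_channels_alignment of 32): in_channels = 320 pads to
// round_up(320, 8 * 32) = 512 by default, but stays at round_up(320, 32) = 320 here.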

uint32_t nhw_out_padded_ntile = get_num_cores_nhw_from_parallel_config(output_parallel_config) *
conv_out_memory_config.shard_spec.value().shape[0] / tt::constants::TILE_HEIGHT;
@@ -141,7 +156,9 @@
device,
groups,
opt_conv_op_block_config.act_block_h_ntiles,
input_width);
input_width,
true,
is_non_tile_mul_width);
}
// if 1x1 conv w/ stride 1, convert input tensor to tile layout if required
if (mm_conv) {
12 changes: 6 additions & 6 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -321,16 +321,16 @@ void py_bind_conv2d(py::module& module) {
py::arg("grid_size"),
py::arg("num_cores_nhw") = 1,
py::arg("num_cores_c") = 1,
py::arg("per_core_out_matrix_height_ntiles").noconvert() = 1,
py::arg("per_core_out_matrix_width_ntiles").noconvert() = 1)
py::arg("per_core_out_matrix_height").noconvert(),
py::arg("per_core_out_matrix_width").noconvert())
.def_property_readonly("grid_size", [](OptimizedConvParallelizationConfig const& c) { return c.grid_size; })
.def_property_readonly(
"num_cores_nhw", [](OptimizedConvParallelizationConfig const& c) { return c.num_cores_nhw; })
.def_property_readonly(
"per_core_out_matrix_height_ntiles",
[](OptimizedConvParallelizationConfig const& c) { return c.per_core_out_matrix_height_ntiles; })
.def_property_readonly("per_core_out_matrix_width_ntiles", [](OptimizedConvParallelizationConfig const& c) {
return c.per_core_out_matrix_width_ntiles;
"per_core_out_matrix_height",
[](OptimizedConvParallelizationConfig const& c) { return c.per_core_out_matrix_height; })
.def_property_readonly("per_core_out_matrix_width", [](OptimizedConvParallelizationConfig const& c) {
return c.per_core_out_matrix_width;
});

py::class_<OptimizedConvBlockConfig>(module, "OptimizedConvBlockConfig")
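The binding change above drops the _ntiles suffix from the per-core output matrix
arguments and properties of OptimizedConvParallelizationConfig. A minimal usage
sketch of the renamed keyword arguments follows; the module path, the CoreCoord
grid argument, and the numeric values are assumptions for illustration, not taken
from this PR.

# Hypothetical usage of the renamed arguments (import path is an assumption):
parallel_config = ttnn.OptimizedConvParallelizationConfig(
    grid_size=ttnn.CoreCoord(8, 8),
    num_cores_nhw=8,
    num_cores_c=8,
    per_core_out_matrix_height=128,  # plain dimension now, no *_ntiles suffix
    per_core_out_matrix_width=40,    # width no longer needs to be a tile multiple
)
print(parallel_config.per_core_out_matrix_height, parallel_config.per_core_out_matrix_width)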