Commit

Conv op non tile multiple shard width (#15742)
nkpatel-tt authored Dec 5, 2024
1 parent 6996c9a commit d6c5a99
Showing 24 changed files with 1,219 additions and 327 deletions.
1 change: 1 addition & 0 deletions tests/scripts/run_tt_eager.py
@@ -36,6 +36,7 @@
TestEntry("tt_eager/tests/ops/test_bcast_op", "ops/test_bcast_op"),
TestEntry("tt_eager/tests/ops/test_transpose_op", "ops/test_transpose_op"),
TestEntry("tt_eager/tests/ops/test_sliding_window_ops", "ops/test_sliding_window_ops"),
TestEntry("tt_eager/tests/ops/test_tensor_utils", "ops/test_tensor_utils"),
TestEntry("tt_eager/tests/ops/test_bmm_op", "ops/test_bmm_op"),
void_for_bh(void_for_whb0(TestEntry("tt_eager/tests/ops/test_eltwise_unary_op", "ops/test_eltwise_unary_op"))),
void_for_whb0(
1 change: 1 addition & 0 deletions tests/tt_eager/CMakeLists.txt
@@ -24,6 +24,7 @@ set(TT_EAGER_TESTS_OPS
ops/test_sfpu.cpp
ops/test_sliding_window_ops.cpp
ops/test_fold_op.cpp
ops/test_tensor_utils.cpp
)

set(TT_EAGER_TESTS_TENSORS
483 changes: 483 additions & 0 deletions tests/tt_eager/ops/test_tensor_utils.cpp

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -2620,6 +2620,92 @@ def test_non_tile_multiple_height_conv_wh(
)


@skip_for_grayskull()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
(1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("enable_auto_formatting", [False])
def test_non_tile_multiple_width_conv_wh(
device,
use_program_cache,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
enable_auto_formatting,
):
run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
use_shallow_conv_variant=(input_channels == 16),
transpose_mcast=use_1d_systolic_array,
enable_auto_formatting=enable_auto_formatting,
padded_input_channels=16 if input_channels == 16 else None,
output_layout=ttnn.ROW_MAJOR_LAYOUT,
)


@skip_for_grayskull()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
def test_shallow_conv_with_tiled_input(device):
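
The new test_non_tile_multiple_width_conv_wh cases above all use 16x16 activations with input channel counts from 64 to 640, so the per-core shard width is often not a full tile wide. A minimal sketch of that arithmetic follows, assuming an 8x8 compute grid (num_cores_c = 8), a 2-byte element size, and TILE_WIDTH = 32; these values are illustrative assumptions, not taken from the commit.

# Sketch only: for each new test case, check whether the per-core shard width is a
# multiple of 16 bytes (the condition the conv2d change below requires) and whether
# it is a whole tile wide (the case the existing path already handled).
TILE_WIDTH = 32      # assumed tile width in elements
NUM_CORES_C = 8      # assumed number of cores along the channel dimension (8x8 grid)
ELEM_SIZE = 2        # assumed element size in bytes (2 for bfloat16, 1 for bfloat8_b weights)

for in_channels in (64, 128, 192, 256, 320, 384, 448, 512, 576, 640):
    channels_per_core = in_channels // NUM_CORES_C
    aligned_to_16 = (ELEM_SIZE * in_channels) % (16 * NUM_CORES_C) == 0
    tile_multiple = channels_per_core % TILE_WIDTH == 0
    print(f"{in_channels=:4d} {channels_per_core=:3d} {aligned_to_16=} {tile_multiple=}")
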
21 changes: 19 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -80,8 +80,17 @@ Result conv2d(
 ttnn::is_tensor_on_device_or_multidevice(input_tensor) ? std::make_optional(input_tensor.memory_config()) : std::nullopt);
 }
 
+ShardOrientation shard_orientation =
+conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
+auto num_cores_c = shard_orientation == ShardOrientation::COL_MAJOR ? device->compute_with_storage_grid_size().y : device->compute_with_storage_grid_size().x;
+auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2;
+bool is_non_tile_mul_width =
+(conv_config.shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && conv_config.act_block_h_override == 0 &&
+(conv_config.weights_dtype == DataType::BFLOAT8_B || conv_config.weights_dtype == DataType::BFLOAT16) &&
+conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size * in_channels) % (16 * num_cores_c)) == 0;
+
 auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required(
-device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv);
+device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv, is_non_tile_mul_width);
 if (tensor_manipulated) {
 if (conv_config.deallocate_activation) {
 ttnn::Tensor input_tensor_ = input_tensor; // TODO: allow in place modification of inputs to the op
@@ -96,6 +105,9 @@
 uint32_t out_channels_padded = tt::round_up(
 out_channels,
 get_num_cores_channels_from_parallel_config(output_parallel_config) * tt::constants::TILE_WIDTH);
+if(is_non_tile_mul_width) {
+out_channels_padded = tt::round_up(out_channels, 32);
+}
 MemoryConfig conv_out_memory_config = create_sharded_memory_config_from_parallel_config(
 ttnn::Shape(std::array<uint32_t, 4>{1, 1, nhw_out, out_channels_padded}),
 output_parallel_config,
@@ -110,6 +122,9 @@
 uint32_t in_channels_padded = tt::round_up(
 in_channels,
 get_num_cores_channels_from_parallel_config(parallel_config) * conv_config.input_channels_alignment);
+if(is_non_tile_mul_width){
+in_channels_padded = tt::round_up(in_channels, conv_config.input_channels_alignment);
+}
 
 uint32_t nhw_out_padded_ntile = get_num_cores_nhw_from_parallel_config(output_parallel_config) *
 conv_out_memory_config.shard_spec.value().shape[0] / tt::constants::TILE_HEIGHT;
@@ -141,7 +156,9 @@
 device,
 groups,
 opt_conv_op_block_config.act_block_h_ntiles,
-input_width);
+input_width,
+true,
+is_non_tile_mul_width);
 }
 // if 1x1 conv w/ stride 1, convert input tensor to tile layout if required
 if (mm_conv) {
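
A rough sketch of what the two new round_up branches above change, using example values drawn from the new tests (in_channels = 192, out_channels = 64) and assumed values for the core count and alignment; none of these numbers come from the diff itself.

# Sketch only: compare the default channel padding with the is_non_tile_mul_width path.
def round_up(value, multiple):
    return ((value + multiple - 1) // multiple) * multiple

TILE_WIDTH = 32
num_cores_channels = 8           # assumed cores along the channel dimension
input_channels_alignment = 16    # assumed conv_config.input_channels_alignment
in_channels, out_channels = 192, 64

# Default path: pad to a multiple of (cores * alignment) for inputs, (cores * tile width) for outputs.
default_in = round_up(in_channels, num_cores_channels * input_channels_alignment)    # 256
default_out = round_up(out_channels, num_cores_channels * TILE_WIDTH)                # 256
# is_non_tile_mul_width path: pad only to the alignment, and only to a single tile of 32 for outputs.
nontile_in = round_up(in_channels, input_channels_alignment)                         # 192
nontile_out = round_up(out_channels, 32)                                             # 64

print(default_in, nontile_in, default_out, nontile_out)
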
12 changes: 6 additions & 6 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -321,16 +321,16 @@ void py_bind_conv2d(py::module& module) {
 py::arg("grid_size"),
 py::arg("num_cores_nhw") = 1,
 py::arg("num_cores_c") = 1,
-py::arg("per_core_out_matrix_height_ntiles").noconvert() = 1,
-py::arg("per_core_out_matrix_width_ntiles").noconvert() = 1)
+py::arg("per_core_out_matrix_height").noconvert(),
+py::arg("per_core_out_matrix_width").noconvert())
 .def_property_readonly("grid_size", [](OptimizedConvParallelizationConfig const& c) { return c.grid_size; })
 .def_property_readonly(
 "num_cores_nhw", [](OptimizedConvParallelizationConfig const& c) { return c.num_cores_nhw; })
 .def_property_readonly(
-"per_core_out_matrix_height_ntiles",
-[](OptimizedConvParallelizationConfig const& c) { return c.per_core_out_matrix_height_ntiles; })
-.def_property_readonly("per_core_out_matrix_width_ntiles", [](OptimizedConvParallelizationConfig const& c) {
-return c.per_core_out_matrix_width_ntiles;
+"per_core_out_matrix_height",
+[](OptimizedConvParallelizationConfig const& c) { return c.per_core_out_matrix_height; })
+.def_property_readonly("per_core_out_matrix_width", [](OptimizedConvParallelizationConfig const& c) {
+return c.per_core_out_matrix_width;
 });
 
 py::class_<OptimizedConvBlockConfig>(module, "OptimizedConvBlockConfig")
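
The binding change above renames the per-core output matrix arguments, dropping the "_ntiles" suffix and the default values, so callers now pass sizes explicitly and in elements rather than tiles. A hypothetical usage sketch follows; the import path, the grid type, and the concrete sizes are assumptions for illustration and are not confirmed by this commit.

# Hypothetical sketch of constructing the updated config; path and values are assumed.
import ttnn

config = ttnn._ttnn.operations.conv.OptimizedConvParallelizationConfig(
    grid_size=ttnn.CoreCoord(8, 8),   # assumed grid argument type
    num_cores_nhw=8,
    num_cores_c=8,
    per_core_out_matrix_height=64,    # now required, in elements (was per_core_out_matrix_height_ntiles)
    per_core_out_matrix_width=64,     # now required, in elements (was per_core_out_matrix_width_ntiles)
)
print(config.per_core_out_matrix_height, config.per_core_out_matrix_width)
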
(The remaining 18 changed files are not shown here.)
