#0: Fix failing test case for width sharded non-32 multiple output width #16224

Status: Open. Wants to merge 6 commits into base: main.
@@ -1623,7 +1623,6 @@ def test_conv2d_localrun(device, input_spec):
[1, 1056, 1056, 96, 96, 3, 3, 2, 2, 1, 1, 4, False, 1], # 127
[1, 528, 528, 192, 192, 3, 3, 2, 2, 1, 1, 2, False, 1], # 220
[1, 2904, 2904, 48, 48, 3, 3, 2, 2, 1, 1, 11, False, 1], # 294
[1, 3024, 1232, 14, 14, 1, 1, 2, 2, 0, 0, 1, False, 1], # 1421
[1, 819, 256, 100, 136, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1443
[1, 819, 256, 50, 68, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1447
[1, 1024, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1458
4 changes: 1 addition & 3 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -530,20 +530,18 @@ def test_conv_features_multi_device(
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, pad_h, pad_w, act_block_w_div",
(
(2, 128, 128, 9, 9, 3, 3, 0, 0, 1),
(2, 128, 256, 9, 9, 3, 3, 1, 1, 1),
(2, 576, 576, 9, 9, 3, 3, 0, 0, 1),
(2, 960, 960, 5, 5, 3, 3, 0, 0, 1),
(2, 256, 2048, 9, 9, 3, 3, 1, 1, 1),
(2, 512, 2048, 17, 17, 3, 3, 1, 1, 1),
(2, 768, 768, 17, 17, 3, 3, 0, 0, 1),
(2, 1280, 2560, 15, 15, 3, 3, 1, 1, 2),
(2, 1280, 2560, 15, 15, 3, 3, 0, 0, 2),
(2, 1280, 1280, 17, 17, 3, 3, 1, 1, 1),
[1, 3024, 1232, 14, 14, 1, 1, 0, 0, 1],
(2, 768, 32, 9, 9, 3, 3, 1, 1, 1),
(2, 64, 128, 9, 9, 3, 3, 1, 1, 1),
(2, 32, 128, 9, 9, 3, 3, 1, 1, 1),
(1, 256, 256, 7, 7, 3, 3, 1, 1, 1),
),
)
@pytest.mark.parametrize(
6 changes: 4 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -54,12 +54,12 @@ Result conv2d(
const std::optional<const Conv2dConfig>& conv_config_,
const std::optional<const DeviceComputeKernelConfig>& compute_config_,
const std::optional<const MemoryConfig>& memory_config) {
const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups);
Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig());
const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups, conv_config);
const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1;
const uint32_t output_width =
((input_width - kernel_size[1] - ((kernel_size[0] - 1) * (dilation[0] - 1)) + 2 * padding[1]) / stride[1]) + 1;

Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig());
const auto compute_grid_size = device->compute_with_storage_grid_size();

bool auto_shard = false;
@@ -155,6 +155,7 @@ Result conv2d(
opt_conv_op_block_config.act_block_w_ntiles,
opt_conv_op_block_config.out_subblock_w_ntiles,
parallel_config,
output_parallel_config,
device,
groups,
opt_conv_op_block_config.act_block_h_ntiles,
@@ -191,6 +192,7 @@ Result conv2d(

if (bypass_halo) {
if (input_tensor_post_tm.layout() == Layout::TILE) {
input_tensor_post_tm = ttnn::reshape(input_tensor_post_tm, input_tensor_post_tm.get_padded_shape());
Contributor: why do we need this reshape here?

Contributor (Author): to_layout was not using the padded shape, and was instead using the logical shape. This was causing an error.

Contributor: At least we should log an issue with unit tests and reference it here; it would be better if we could just fix to_layout. Is halo with untilize doing this properly?

input_tensor_post_tm = ttnn::to_layout(
input_tensor_post_tm, Layout::ROW_MAJOR, std::nullopt, std::nullopt, device);
}
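The review thread above asks why the reshape is needed before to_layout; the author's answer is that to_layout used the tensor's logical shape rather than its padded shape. The standalone sketch below, which is not ttnn code, illustrates the distinction being worked around: tile layouts store the last two dimensions rounded up to the 32x32 tile size, so the physical buffer is described by the padded shape, and driving the conversion from the logical shape of a tensor whose dimensions are not tile multiples under-counts the data. The shapes, the core count, and the round_up_to_tile helper are illustrative assumptions, not values from the PR.

```cpp
// Illustrative sketch only (not ttnn code): why converting a TILE-laid-out tensor
// to ROW_MAJOR must look at the padded shape. Tile layouts round the last two
// dims up to 32, so the buffer really holds padded_elems values; sizing the
// conversion from the logical shape under-counts them.
#include <array>
#include <cstdint>
#include <iostream>

constexpr uint32_t TILE = 32;
constexpr uint32_t round_up_to_tile(uint32_t v) { return ((v + TILE - 1) / TILE) * TILE; }

int main() {
    // Logical activation shape [1, 1, N*H*W, C] for the new test case:
    // N = 1, H = W = 14 -> 196 rows, C = 1232 input channels (not a multiple of 32).
    // Width-sharded over, say, 7 cores (hypothetical) that is 176 channels per shard.
    std::array<uint32_t, 4> logical = {1, 1, 196, 1232};
    std::array<uint32_t, 4> padded = {1, 1, round_up_to_tile(logical[2]), round_up_to_tile(logical[3])};

    uint64_t logical_elems = uint64_t(logical[2]) * logical[3];
    uint64_t padded_elems = uint64_t(padded[2]) * padded[3];

    // A layout conversion driven by the logical shape would iterate logical_elems
    // values, while the tiled storage actually holds padded_elems. The PR sidesteps
    // this by reshaping to the padded shape before calling to_layout, so the
    // ROW_MAJOR conversion sees a tile-aligned shape.
    std::cout << "logical elements: " << logical_elems << "\n"
              << "padded  elements: " << padded_elems << "\n";
    return 0;
}
```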
8 changes: 5 additions & 3 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -184,7 +184,7 @@ ParallelConfig determine_parallel_config(
return pconfig;
}

static ParallelConfig determine_output_parallel_config(
ParallelConfig determine_output_parallel_config(
const ParallelConfig& input_parallel_config,
const CoreCoord& compute_grid_size,
uint32_t out_channels,
@@ -393,9 +393,11 @@ bool use_matmul_for_1x1_conv(
const std::array<uint32_t, 2>& stride,
const std::array<uint32_t, 2>& padding,
const std::array<uint32_t, 2>& dilation,
uint32_t groups) {
uint32_t groups,
const Conv2dConfig& conv_config) {
bool is_width_sharded = (conv_config.shard_layout.has_value() && conv_config.shard_layout.value() == TensorMemoryLayout::WIDTH_SHARDED);
return kernel_size[0] == 1 && kernel_size[1] == 1 && stride[0] == stride[1] && stride[0] == 1 && padding[0] == 0 &&
padding[1] == 0 && dilation[0] == 1 && dilation[1] == 1 && groups == 1;
padding[1] == 0 && dilation[0] == 1 && dilation[1] == 1 && groups == 1 && (not is_width_sharded);
}

// Implements a heuristic for selecting shard layout based on how many Tensix cores are available
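For context on the predicate changed above: a 1x1 convolution with stride 1, zero padding, dilation 1, and a single group is exactly a matmul of the activations flattened to [N*H*W, C_in] against a [C_in, C_out] weight matrix, which is why use_matmul_for_1x1_conv can route such convs to the matmul path at all; the new is_width_sharded check simply opts width-sharded configs out of that shortcut. The standalone sketch below is not ttnn code and uses made-up shapes; it just verifies the underlying identity with a naive NHWC convolution.

```cpp
// Standalone sketch (not ttnn code): a naive "valid" NHWC conv2d with a 1x1
// kernel, stride 1, zero padding, dilation 1 and groups == 1 matches a plain
// matmul of [N*H*W, Cin] x [Cin, Cout]. Shapes are made up for the demo.
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <vector>

// Naive NHWC convolution; weights laid out as [KH, KW, Cin, Cout].
std::vector<float> conv2d_nhwc(const std::vector<float>& act, const std::vector<float>& wgt,
                               int N, int H, int W, int Cin, int Cout, int KH, int KW) {
    const int OH = H - KH + 1, OW = W - KW + 1;  // stride 1, no padding
    std::vector<float> out(static_cast<size_t>(N) * OH * OW * Cout, 0.f);
    for (int n = 0; n < N; n++)
        for (int y = 0; y < OH; y++)
            for (int x = 0; x < OW; x++)
                for (int co = 0; co < Cout; co++)
                    for (int ky = 0; ky < KH; ky++)
                        for (int kx = 0; kx < KW; kx++)
                            for (int ci = 0; ci < Cin; ci++)
                                out[((static_cast<size_t>(n) * OH + y) * OW + x) * Cout + co] +=
                                    act[((static_cast<size_t>(n) * H + y + ky) * W + x + kx) * Cin + ci] *
                                    wgt[((static_cast<size_t>(ky) * KW + kx) * Cin + ci) * Cout + co];
    return out;
}

int main() {
    const int N = 2, H = 5, W = 5, Cin = 8, Cout = 12;  // made-up demo shapes
    std::vector<float> act(static_cast<size_t>(N) * H * W * Cin);
    std::vector<float> wgt(static_cast<size_t>(Cin) * Cout);  // 1x1 kernel -> weights are just [Cin, Cout]
    for (auto& v : act) v = static_cast<float>(std::rand()) / RAND_MAX;
    for (auto& v : wgt) v = static_cast<float>(std::rand()) / RAND_MAX;

    // Conv path with a 1x1 kernel...
    std::vector<float> conv_out = conv2d_nhwc(act, wgt, N, H, W, Cin, Cout, 1, 1);

    // ...equals a plain matmul over the flattened activations.
    std::vector<float> mm_out(static_cast<size_t>(N) * H * W * Cout, 0.f);
    for (int p = 0; p < N * H * W; p++)
        for (int co = 0; co < Cout; co++)
            for (int ci = 0; ci < Cin; ci++)
                mm_out[p * Cout + co] += act[p * Cin + ci] * wgt[ci * Cout + co];

    for (size_t i = 0; i < conv_out.size(); i++) assert(std::fabs(conv_out[i] - mm_out[i]) < 1e-4f);
    return 0;
}
```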
9 changes: 8 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp
@@ -94,7 +94,8 @@ bool use_matmul_for_1x1_conv(
const std::array<uint32_t, 2>& stride,
const std::array<uint32_t, 2>& padding,
const std::array<uint32_t, 2>& dilation,
uint32_t groups);
uint32_t groups,
const Conv2dConfig& conv_config);

sliding_window::ParallelConfig determine_parallel_config(
const TensorMemoryLayout shard_layout,
@@ -108,6 +109,12 @@ sliding_window::ParallelConfig determine_parallel_config(
bool enable_channels_padding,
bool is_out_tiled=true);

sliding_window::ParallelConfig determine_output_parallel_config(
const sliding_window::ParallelConfig& input_parallel_config,
const CoreCoord& compute_grid_size,
uint32_t out_channels,
bool is_mm_conv);

uint32_t get_num_cores_nhw_from_parallel_config(const sliding_window::ParallelConfig& pconfig);

uint32_t get_num_cores_channels_from_parallel_config(const sliding_window::ParallelConfig& pconfig);
@@ -368,7 +368,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
uint32_t num_groups = num_blocks_act_h * num_blocks_act_w * num_blocks_weight_w;
// writer of conv op partially removes padding on the width
// it removes the padding done for block width but it doesn't remove padding done for tiled width
uint32_t output_channels_padded_to_tile_width = round_up(output_channels, input_num_cores * TILE_WIDTH);
uint32_t output_channels_padded_to_tile_width = round_up(output_channels, output_num_cores * TILE_WIDTH);
TT_FATAL(
output_channels_padded_to_tile_width <= weight_matrix_width,
"output_channels_padded_to_tile_width {} should be less than or equal to weight_matrix_width {}",
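The one-line change above pads the writer's output width to a multiple of output_num_cores * TILE_WIDTH instead of input_num_cores * TILE_WIDTH. The sketch below works through the arithmetic with out_channels = 3024 from the new unit test and hypothetical core counts (the real values depend on the device grid and the chosen parallel configs): padding by the input core count can exceed the weight matrix width, which after this PR is padded using the output parallel config in prepare_conv_weights_biases below, and that is exactly the condition the TT_FATAL above guards against.

```cpp
// Worked example for the one-line fix above. The core counts are hypothetical;
// out_channels comes from the new unit test and TILE_WIDTH is the tile size.
#include <cstdint>
#include <iostream>

constexpr uint32_t round_up(uint32_t v, uint32_t m) { return ((v + m - 1) / m) * m; }

int main() {
    const uint32_t TILE_WIDTH = 32;
    const uint32_t out_channels = 3024;   // from the new test case, not a multiple of 32
    const uint32_t input_num_cores = 64;  // hypothetical width-sharding core counts
    const uint32_t output_num_cores = 8;

    // Weights are padded using the *output* parallel config (see prepare_conv_weights_biases),
    // so this is the width the writer must stay within.
    uint32_t weight_matrix_width = round_up(out_channels, output_num_cores * TILE_WIDTH);

    // Old behaviour: pad by the input core count, which can overshoot weight_matrix_width.
    uint32_t padded_by_input_cores = round_up(out_channels, input_num_cores * TILE_WIDTH);
    // New behaviour: pad by the output core count, which matches the weight padding.
    uint32_t padded_by_output_cores = round_up(out_channels, output_num_cores * TILE_WIDTH);

    std::cout << "weight_matrix_width           = " << weight_matrix_width << "\n"    // 3072
              << "padded by input_num_cores*32  = " << padded_by_input_cores << "\n"  // 4096 > 3072
              << "padded by output_num_cores*32 = " << padded_by_output_cores << "\n"; // 3072
    return 0;
}
```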
46 changes: 31 additions & 15 deletions ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp
@@ -71,7 +71,7 @@ ttnn::Tensor conv_bias_layout_convert(
validate_bias_tensor(bias_tensor_);
if (!is_non_tile_mul_width) {
auto bias_shape = bias_tensor_.get_shape();
TT_FATAL(bias_shape[3] == out_channels && bias_shape[0] == 1 && bias_shape[1] == 1 && bias_shape[2] == 1, "bias shape is not correct");
TT_FATAL(bias_shape[0] == 1 && bias_shape[1] == 1 && bias_shape[2] == 1, "bias shape is not correct");
tt::tt_metal::LegacyShape bias_channels_padded_shape = tt::tt_metal::LegacyShape(
std::array<uint32_t, 4>({1, 1, 32, round_up(out_channels, weight_block_w_ntiles * 32)}));
bias_tensor_ = ttnn::pad(bias_tensor_, bias_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D{0, 0, 0, 0}, 0);
@@ -188,7 +188,8 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
DataType weights_bias_dtype,
uint32_t weight_block_h_ntiles,
uint32_t weight_block_w_ntiles,
const ParallelConfig& parallel_config,
const ParallelConfig& input_parallel_config,
const ParallelConfig& output_parallel_config,
T * device,
uint32_t groups,
uint32_t act_block_h_ntiles,
@@ -231,9 +232,11 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
uint32_t window_h = weights_shape[2];
uint32_t window_w = weights_shape[3];

uint32_t num_cores_channels = get_num_cores_channels_from_parallel_config(parallel_config);
uint32_t out_channels_padded = tt::round_up(out_channels, num_cores_channels * tt::constants::TILE_WIDTH);
uint32_t in_channels_padded = tt::round_up(in_channels, num_cores_channels * input_channels_alignment);
uint32_t input_num_cores_channels = get_num_cores_channels_from_parallel_config(input_parallel_config);
uint32_t output_num_cores_channels = get_num_cores_channels_from_parallel_config(output_parallel_config);

uint32_t out_channels_padded = tt::round_up(out_channels, output_num_cores_channels * tt::constants::TILE_WIDTH);
uint32_t in_channels_padded = tt::round_up(in_channels, input_num_cores_channels * input_channels_alignment);
uint32_t out_channel_padding = out_channels_padded - out_channels;

tt::tt_metal::LegacyShape weights_channels_padded_shape = tt::tt_metal::LegacyShape(std::array<uint32_t, 4>(
@@ -258,12 +261,12 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
weight_tensor_ = ttnn::pad(weight_tensor_, weights_channels_padded_shape.to_array_4D(), tt::tt_metal::Array4D({0, 0, 0, 0}), 0);

// for conv op, pad the weights to block shape
if (parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED) {
if (input_parallel_config.shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED) {
weight_tensor_ = tt::tt_metal::convert_conv_weight_tensor_to_special_padding_tiled_layout(
weight_tensor_, weight_block_h_ntiles, weight_block_w_ntiles, weights_bias_dtype);
} else if(parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) {
} else if(input_parallel_config.shard_scheme == TensorMemoryLayout::BLOCK_SHARDED) {
weight_tensor_ = tt::tt_metal::convert_conv_weight_tensor_to_tiled_layout_block_sharded(
weight_tensor_, num_cores_channels, weights_bias_dtype);
weight_tensor_, input_num_cores_channels, weights_bias_dtype);
} else {
weight_tensor_ = tt::tt_metal::convert_conv_weight_tensor_to_tiled_layout(
weight_tensor_, weight_block_h_ntiles, weight_block_w_ntiles, weights_bias_dtype);
@@ -289,7 +292,8 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
bias_tensor_ = bias_tensor.value();
bool is_bias_tensor_is_on_device = ttnn::is_tensor_on_device_or_multidevice(bias_tensor_);
if(!is_bias_tensor_is_on_device) {
bias_tensor_ = conv_bias_layout_convert(bias_tensor_, weights_bias_dtype, weight_block_h_ntiles, weight_block_w_ntiles, parallel_config, device, out_channels, is_non_tile_mul_width);
TT_FATAL(bias_tensor_.shape()[3]==out_channels, "Bias must have the same length as output channels");
bias_tensor_ = conv_bias_layout_convert(bias_tensor_, weights_bias_dtype, weight_block_h_ntiles, weight_block_w_ntiles, output_parallel_config, device, out_channels_padded, is_non_tile_mul_width);
bias_tensor_ = ttnn::operations::core::to_device(bias_tensor_, device, std::nullopt);
}
}
@@ -326,7 +330,7 @@ ttnn::Tensor prepare_conv_weights(
false,
false
));
const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups);
const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups, conv_config);
const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1;
const uint32_t output_width =
((input_width - kernel_size[1] - ((kernel_size[0] - 1) * (dilation[0] - 1)) + 2 * padding[1]) / stride[1]) + 1;
@@ -365,6 +369,9 @@ ttnn::Tensor prepare_conv_weights(
shard_orientation,
!use_non_tile_height);

ParallelConfig output_parallel_config =
determine_output_parallel_config(parallel_config, device->compute_with_storage_grid_size(), out_channels, mm_conv);

bool is_non_tile_mul_width = check_non_tile_mul_width(device, conv_config, in_channels);
std::optional<const ttnn::Tensor> bias_tensor = std::nullopt;
ttnn::Tensor weight_tensor_on_device = weight_tensor;
@@ -377,6 +384,7 @@ ttnn::Tensor prepare_conv_weights(
opt_conv_op_block_config.act_block_w_ntiles,
opt_conv_op_block_config.out_subblock_w_ntiles,
parallel_config,
output_parallel_config,
device,
groups,
opt_conv_op_block_config.act_block_h_ntiles,
@@ -408,12 +416,13 @@ ttnn::Tensor prepare_conv_bias(

TT_FATAL(!ttnn::is_tensor_on_device_or_multidevice(bias_tensor), "Error: bias tensor must be on host for preparation.");

const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups);
Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig());

const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups, conv_config);
const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1;
const uint32_t output_width =
((input_width - kernel_size[1] - ((kernel_size[0] - 1) * (dilation[0] - 1)) + 2 * padding[1]) / stride[1]) + 1;

Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig());
DeviceComputeKernelConfig compute_config = compute_config_.value_or(init_device_compute_kernel_config(
device->arch(),
std::nullopt,
@@ -458,14 +467,19 @@ ttnn::Tensor prepare_conv_bias(
shard_orientation,
!use_non_tile_height);

ParallelConfig output_parallel_config =
determine_output_parallel_config(parallel_config, device->compute_with_storage_grid_size(), out_channels, mm_conv);

bool is_non_tile_mul_width = check_non_tile_mul_width(device, conv_config, in_channels);
ttnn::Tensor bias_tensor_ = bias_tensor;
TT_FATAL(bias_tensor_.shape()[3]==out_channels, "Bias must have the same length as output channels");

bias_tensor_ = conv_bias_layout_convert(
bias_tensor_,
conv_config.weights_dtype,
opt_conv_op_block_config.act_block_h_ntiles,
weight_block_w_ntiles,
parallel_config,
output_parallel_config,
device,
out_channels,
is_non_tile_mul_width
@@ -550,7 +564,8 @@ template std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weigh
DataType weights_bias_dtype,
uint32_t weight_block_h_ntiles,
uint32_t weight_block_w_ntiles,
const ParallelConfig& parallel_config,
const ParallelConfig& input_parallel_config,
const ParallelConfig& output_parallel_config,
Device* device,
uint32_t groups,
uint32_t act_block_h_ntiles,
@@ -565,7 +580,8 @@ template std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weigh
DataType weights_bias_dtype,
uint32_t weight_block_h_ntiles,
uint32_t weight_block_w_ntiles,
const ParallelConfig& parallel_config,
const ParallelConfig& input_parallel_config,
const ParallelConfig& output_parallel_config,
MeshDevice* device,
uint32_t groups,
uint32_t act_block_h_ntiles,
@@ -77,7 +77,8 @@ std::pair<ttnn::Tensor, std::optional<ttnn::Tensor>> prepare_conv_weights_biases
DataType weights_bias_dtype,
uint32_t weight_block_h_ntiles,
uint32_t weight_block_w_ntiles,
const sliding_window::ParallelConfig& parallel_config,
const sliding_window::ParallelConfig& input_parallel_config,
const sliding_window::ParallelConfig& output_parallel_config,
T * device,
uint32_t groups,
uint32_t act_block_h_ntiles,
@@ -166,7 +166,7 @@ Result conv_transpose2d(

log_debug(LogOp, "Padding : ({},{}) ({},{})", input_pad_top, input_pad_bottom, input_pad_left, input_pad_right);

const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, {1, 1}, {input_pad_top + input_pad_bottom, input_pad_left + input_pad_right}, dilation, groups);
const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, {1, 1}, {input_pad_top + input_pad_bottom, input_pad_left + input_pad_right}, dilation, groups, conv_config);

const auto compute_grid_size = device->compute_with_storage_grid_size();

@@ -281,6 +281,7 @@ Result conv_transpose2d(
opt_conv_op_block_config.act_block_w_ntiles,
opt_conv_op_block_config.out_subblock_w_ntiles,
parallel_config,
output_parallel_config,
device,
groups,
opt_conv_op_block_config.act_block_h_ntiles,