
Address review comments
Signed-off-by: Nilaykumar Patel <[email protected]>
nkpatel-tt committed Nov 26, 2024
1 parent b97e74f commit 20db38a
Showing 3 changed files with 41 additions and 27 deletions.
38 changes: 22 additions & 16 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -2626,17 +2626,33 @@ def test_non_tile_multiple_height_conv_wh(
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
(1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 512, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 512, 512, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16],
[ttnn.bfloat16, ttnn.bfloat8_b],
)
@pytest.mark.parametrize(
"activations_dtype",
@@ -2665,16 +2681,6 @@ def test_non_tile_multiple_width_conv_wh(
config_override,
enable_auto_formatting,
):
# Skip test cases raising OOM, but do not affect the SD e2e test
if (input_channels == 960 and config_override == None and fp32_accum == True) or (
output_channels == 1280
and input_height == 32
and activations_dtype == ttnn.bfloat16
and weights_dtype == ttnn.bfloat16
and enable_auto_formatting == False
):
pytest.skip("Skip the test cases raising OOM but not affecting e2e test")

run_conv(
device,
math_fidelity,
@@ -2694,7 +2700,7 @@ def test_non_tile_multiple_width_conv_wh(
use_1d_systolic_array,
config_override,
use_shallow_conv_variant=(input_channels == 16),
transpose_mcast=use_1d_systolic_array, ## use RM (transpose_mcast=False) with 2D on WH
transpose_mcast=use_1d_systolic_array,
enable_auto_formatting=enable_auto_formatting,
padded_input_channels=16 if input_channels == 16 else None,
output_layout=ttnn.ROW_MAJOR_LAYOUT,
6 changes: 3 additions & 3 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -106,8 +106,7 @@ ParallelConfig determine_parallel_config(
block_shard_orientation == ShardOrientation::COL_MAJOR ? compute_grid_size.x : compute_grid_size.y;
auto channels_per_core = std::ceil((float)input_channels / effective_tile_width);
if(is_non_tile_mul_width) {
out_nhw_ntiles = tt::round_up(batch_size * output_height * output_width, tt::constants::TILE_HEIGHT) / tt::constants::TILE_HEIGHT;
out_c_ntiles = tt::round_up(output_channels, effective_tile_width) / 1;
out_c_ntiles = tt::round_up(output_channels, effective_tile_width);
channels_per_core = input_channels;
}
num_cores_nhw = find_closest_largest_divisor_with_num_padding(out_nhw_ntiles, start_divisor);
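For context, a minimal standalone sketch of the tile-count arithmetic in this hunk, assuming round_up(v, m) pads v up to the next multiple of m; the helper and the sample sizes are illustrative, not the tt-metal implementation. The commit drops the redundant recomputation of out_nhw_ntiles and the no-op "/ 1" on the channel count, so out_c_ntiles is simply output_channels padded to the effective tile width.

```cpp
// Illustrative only: hypothetical round_up helper and sample conv sizes.
#include <cstdint>

static uint32_t round_up(uint32_t v, uint32_t multiple) {
    return ((v + multiple - 1) / multiple) * multiple;
}

int main() {
    constexpr uint32_t TILE_HEIGHT = 32;
    uint32_t batch = 1, out_h = 16, out_w = 16, out_c = 320;
    uint32_t effective_tile_width = 8;  // assumed grid-dependent value

    // NHW extent is counted in tiles of TILE_HEIGHT rows (8 tiles here).
    uint32_t out_nhw_ntiles = round_up(batch * out_h * out_w, TILE_HEIGHT) / TILE_HEIGHT;
    // Channel extent is only padded to the effective tile width (320 here);
    // the former "/ 1" divisor was a no-op and is removed in this commit.
    uint32_t out_c_ntiles = round_up(out_c, effective_tile_width);
    return (out_nhw_ntiles == 8 && out_c_ntiles == 320) ? 0 : 1;
}
```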
@@ -455,8 +454,9 @@ std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_sh
auto block_shard_orientation =
conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
auto num_cores_c = block_shard_orientation == ShardOrientation::COL_MAJOR ? device->compute_with_storage_grid_size().y : device->compute_with_storage_grid_size().x;
auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2;
bool is_non_tile_mul_width = (shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && conv_config.act_block_h_override == 0 &&
(conv_config.dtype == DataType::BFLOAT16 || conv_config.dtype == DataType::FLOAT32) && conv_config.output_layout == Layout::ROW_MAJOR && ((2*in_channels) % (16 * num_cores_c)) == 0;
conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size*in_channels) % (16 * num_cores_c)) == 0;
ParallelConfig optimal_parallel_config = determine_parallel_config(
shard_layout, batch_size, in_channels, height, width, out_channels, device->compute_with_storage_grid_size(), block_shard_orientation, !use_non_tile_height, is_non_tile_mul_width);
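The second hunk makes the is_non_tile_mul_width eligibility test dtype-aware: the per-element byte size is now derived from weights_dtype (1 byte for BFLOAT8_B, 2 otherwise) instead of being hard-coded to 2, and the explicit BFLOAT16/FLOAT32 dtype guard is dropped. A hedged sketch of the divisibility check, assuming the intent is that the channel dimension measured in bytes must split into 16-byte-aligned slices across the core columns:

```cpp
// Sketch of the eligibility check; the alignment rationale is an assumption.
#include <cstdint>

// elem_size_bytes: 1 for BFLOAT8_B weights, 2 for bfloat16 (per this hunk).
bool channels_split_evenly(uint32_t in_channels,
                           uint32_t num_cores_c,
                           uint32_t elem_size_bytes) {
    // The channel dimension, in bytes, must divide evenly into
    // 16-byte-aligned slices across num_cores_c core columns.
    return (elem_size_bytes * in_channels) % (16 * num_cores_c) == 0;
}

// Example: 640 bfloat16 channels over 8 core columns -> 1280 % 128 == 0 -> eligible;
// the same 640 channels with BFLOAT8_B weights -> 640 % 128 == 0 -> still eligible.
```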

@@ -75,18 +75,26 @@ void copy_sticks_async(
if constexpr (is_read) {
uint32_t dst_addr = out_base_l1_addr + dst_offset;
uint64_t src_addr = base_addr + src_offset;
for(uint16_t k = 0; k < nsticks; k++) {
noc_async_read(src_addr, dst_addr, stick_nbytes);
dst_addr += stick_nbytes;
src_addr += input_aligned_page_size;
if (stick_nbytes == input_aligned_page_size) {
noc_async_read(src_addr, dst_addr, size);
} else {
for (uint16_t k = 0; k < nsticks; k++) {
noc_async_read(src_addr, dst_addr, stick_nbytes);
dst_addr += stick_nbytes;
src_addr += input_aligned_page_size;
}
}
} else {
uint64_t dst_addr = base_addr + dst_offset;
uint32_t src_addr = in_base_l1_addr + src_offset;
for(uint16_t k = 0; k < nsticks; k++) {
noc_async_write(src_addr, dst_addr, stick_nbytes);
dst_addr += stick_nbytes;
src_addr += input_aligned_page_size;
if (stick_nbytes == input_aligned_page_size) {
noc_async_write(src_addr, dst_addr, size);
} else {
for (uint16_t k = 0; k < nsticks; k++) {
noc_async_write(src_addr, dst_addr, stick_nbytes);
dst_addr += stick_nbytes;
src_addr += input_aligned_page_size;
}
}
}
}
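The kernel change adds a fast path: when each stick's byte length equals the aligned page size, the sticks sit back-to-back in memory, so the per-stick loop of NoC transactions collapses into a single transfer of size bytes (assumed to be nsticks * stick_nbytes, as in the surrounding kernel). A minimal host-side sketch of the same idea using memcpy rather than the device NoC API:

```cpp
// Host-side illustration only; the real kernel issues noc_async_read/write.
#include <cstdint>
#include <cstring>

void copy_sticks(uint8_t* dst, const uint8_t* src,
                 uint16_t nsticks, uint32_t stick_nbytes,
                 uint32_t input_aligned_page_size) {
    const uint32_t size = nsticks * stick_nbytes;  // assumed definition of `size`
    if (stick_nbytes == input_aligned_page_size) {
        // Fast path: no padding between sticks, so one bulk copy moves them all.
        std::memcpy(dst, src, size);
    } else {
        // Slow path: sticks are padded to the aligned page size; copy one at a
        // time, advancing the source by the aligned stride and the destination
        // by the packed stick length.
        for (uint16_t k = 0; k < nsticks; k++) {
            std::memcpy(dst, src, stick_nbytes);
            dst += stick_nbytes;
            src += input_aligned_page_size;
        }
    }
}
```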
