diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index 8341540c5ef3..6944b86f4a2d 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -2626,17 +2626,33 @@ def test_non_tile_multiple_height_conv_wh(
 @pytest.mark.parametrize(
     "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
     (
+        (1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
         (1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 512, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 512, 512, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
         (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
     ),
 )
 @pytest.mark.parametrize(
     "weights_dtype",
-    [ttnn.bfloat16],
+    [ttnn.bfloat16, ttnn.bfloat8_b],
 )
 @pytest.mark.parametrize(
     "activations_dtype",
@@ -2665,16 +2681,6 @@ def test_non_tile_multiple_width_conv_wh(
     config_override,
     enable_auto_formatting,
 ):
-    # Skip test cases raising OOM, but do not affect the SD e2e test
-    if (input_channels == 960 and config_override == None and fp32_accum == True) or (
-        output_channels == 1280
-        and input_height == 32
-        and activations_dtype == ttnn.bfloat16
-        and weights_dtype == ttnn.bfloat16
-        and enable_auto_formatting == False
-    ):
-        pytest.skip("Skip the test cases raising OOM but not affecting e2e test")
-
     run_conv(
         device,
         math_fidelity,
@@ -2694,7 +2700,7 @@ def test_non_tile_multiple_width_conv_wh(
         use_1d_systolic_array,
         config_override,
         use_shallow_conv_variant=(input_channels == 16),
-        transpose_mcast=use_1d_systolic_array,  ## use RM (transpose_mcast=False) with 2D on WH
+        transpose_mcast=use_1d_systolic_array,
         enable_auto_formatting=enable_auto_formatting,
         padded_input_channels=16 if input_channels == 16 else None,
         output_layout=ttnn.ROW_MAJOR_LAYOUT,
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
index 4af7cb3d4ec9..27a0a5e9366e 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -106,8 +106,7 @@ ParallelConfig determine_parallel_config(
             block_shard_orientation == ShardOrientation::COL_MAJOR ? compute_grid_size.x : compute_grid_size.y;
         auto channels_per_core = std::ceil((float)input_channels / effective_tile_width);
         if(is_non_tile_mul_width) {
-            out_nhw_ntiles = tt::round_up(batch_size * output_height * output_width, tt::constants::TILE_HEIGHT) / tt::constants::TILE_HEIGHT;
-            out_c_ntiles = tt::round_up(output_channels, effective_tile_width) / 1;
+            out_c_ntiles = tt::round_up(output_channels, effective_tile_width);
             channels_per_core = input_channels;
         }
         num_cores_nhw = find_closest_largest_divisor_with_num_padding(out_nhw_ntiles, start_divisor);
@@ -455,8 +454,9 @@ std::tuple get_conv_padded_input_sh
         auto block_shard_orientation = conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
         auto num_cores_c = block_shard_orientation == ShardOrientation::COL_MAJOR ?
             device->compute_with_storage_grid_size().y : device->compute_with_storage_grid_size().x;
+        auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2;
         bool is_non_tile_mul_width = (shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && conv_config.act_block_h_override == 0 &&
-            (conv_config.dtype == DataType::BFLOAT16 || conv_config.dtype == DataType::FLOAT32) && conv_config.output_layout == Layout::ROW_MAJOR && ((2*in_channels) % (16 * num_cores_c)) == 0;
+            conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size*in_channels) % (16 * num_cores_c)) == 0;
         ParallelConfig optimal_parallel_config = determine_parallel_config(
             shard_layout, batch_size, in_channels, height, width, out_channels, device->compute_with_storage_grid_size(),
             block_shard_orientation, !use_non_tile_height, is_non_tile_mul_width);
diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp
index 27de632f87d3..a9b7697ded09 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp
@@ -75,18 +75,26 @@ void copy_sticks_async(
         if constexpr (is_read) {
             uint32_t dst_addr = out_base_l1_addr + dst_offset;
             uint64_t src_addr = base_addr + src_offset;
-            for(uint16_t k = 0; k < nsticks; k++) {
-                noc_async_read(src_addr, dst_addr, stick_nbytes);
-                dst_addr += stick_nbytes;
-                src_addr += input_aligned_page_size;
+            if (stick_nbytes == input_aligned_page_size) {
+                noc_async_read(src_addr, dst_addr, size);
+            } else {
+                for (uint16_t k = 0; k < nsticks; k++) {
+                    noc_async_read(src_addr, dst_addr, stick_nbytes);
+                    dst_addr += stick_nbytes;
+                    src_addr += input_aligned_page_size;
+                }
             }
         } else {
             uint64_t dst_addr = base_addr + dst_offset;
             uint32_t src_addr = in_base_l1_addr + src_offset;
-            for(uint16_t k = 0; k < nsticks; k++) {
-                noc_async_write(src_addr, dst_addr, stick_nbytes);
-                dst_addr += stick_nbytes;
-                src_addr += input_aligned_page_size;
+            if (stick_nbytes == input_aligned_page_size) {
+                noc_async_write(src_addr, dst_addr, size);
+            } else {
+                for (uint16_t k = 0; k < nsticks; k++) {
+                    noc_async_write(src_addr, dst_addr, stick_nbytes);
+                    dst_addr += stick_nbytes;
+                    src_addr += input_aligned_page_size;
+                }
             }
         }
     }
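
Note on the conv2d.cpp hunk above: the is_non_tile_mul_width gate no longer depends on the activation dtype; the divisibility check is instead scaled by the weights element size (1 byte for BFLOAT8_B, 2 bytes otherwise). A minimal stand-alone sketch of that arithmetic follows; num_cores_c = 8 is an assumed value (one dimension of a Wormhole 8x8 compute grid), not something fixed by this diff.

// Hypothetical stand-alone check of the relaxed divisibility gate from conv2d.cpp.
#include <cstdint>
#include <iostream>

static bool passes_gate(uint32_t in_channels, bool weights_are_bfp8b) {
    const uint32_t elem_size = weights_are_bfp8b ? 1 : 2;  // bytes per weight element
    const uint32_t num_cores_c = 8;                        // assumed grid dimension
    return (elem_size * in_channels) % (16 * num_cores_c) == 0;
}

int main() {
    std::cout << passes_gate(64, false) << "\n";   // bfloat16:  2 * 64  = 128, divisible by 128 -> 1
    std::cout << passes_gate(64, true) << "\n";    // bfloat8_b: 1 * 64  =  64, not divisible    -> 0
    std::cout << passes_gate(128, true) << "\n";   // bfloat8_b: 1 * 128 = 128, divisible        -> 1
    return 0;
}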
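
The halo_gather.cpp hunk adds a fast path to copy_sticks_async: when the stick payload (stick_nbytes) equals the source stride (input_aligned_page_size), the sticks are already packed back to back, so a single NOC transaction of size bytes can replace nsticks per-stick transactions. The host-side sketch below models the same coalescing decision with memcpy standing in for noc_async_read/noc_async_write; it assumes size == nsticks * stick_nbytes, which is what makes the single transfer equivalent to the loop.

// Host-side model of the coalescing fast path; memcpy stands in for NOC transactions.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

void copy_sticks(uint8_t* dst, const uint8_t* src, uint16_t nsticks,
                 uint32_t stick_nbytes, uint32_t input_aligned_page_size) {
    const uint32_t size = nsticks * stick_nbytes;  // assumed meaning of `size` in the kernel
    if (stick_nbytes == input_aligned_page_size) {
        // Source sticks are contiguous: one transfer covers all of them.
        std::memcpy(dst, src, size);
    } else {
        // Source sticks are padded out to input_aligned_page_size: copy one stick at a time,
        // packing them densely at the destination.
        for (uint16_t k = 0; k < nsticks; ++k) {
            std::memcpy(dst + k * stick_nbytes, src + k * input_aligned_page_size, stick_nbytes);
        }
    }
}

int main() {
    constexpr uint16_t nsticks = 4;
    constexpr uint32_t stick_nbytes = 32;

    // Fast path: stride == payload.
    std::vector<uint8_t> src_dense(nsticks * stick_nbytes, 0xAB), dst(nsticks * stick_nbytes, 0);
    copy_sticks(dst.data(), src_dense.data(), nsticks, stick_nbytes, stick_nbytes);
    assert(dst.front() == 0xAB && dst.back() == 0xAB);

    // Slow path: each stick sits in a 48-byte aligned page at the source.
    std::vector<uint8_t> src_padded(nsticks * 48, 0xCD);
    copy_sticks(dst.data(), src_padded.data(), nsticks, stick_nbytes, 48);
    assert(dst.front() == 0xCD && dst.back() == 0xCD);
    return 0;
}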