diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
index 004753e50e60..d659fce60b2b 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -5,10 +5,12 @@
 #include
 #include
 #include
+#include
 #include "conv2d_utils.hpp"
 #include "common/constants.hpp"
 #include "impl/buffers/buffer_constants.hpp"
+#include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
 #include "ttnn/operations/core/core.hpp"
 #include "ttnn/operations/pool/downsample/device/downsample_op.hpp"
@@ -16,6 +18,7 @@
 #include "tt_metal/common/work_split.hpp"
 #include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp"
 #include "ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp"
+#include "ttnn/operations/sliding_window/sliding_window.hpp"
 #include "ttnn/tensor/tensor.hpp"
 #include "tt_metal/common/core_coord.hpp"
@@ -397,6 +400,7 @@ bool use_matmul_for_1x1_conv(
         padding[1] == 0 && dilation[0] == 1 && dilation[1] == 1 && groups == 1;
 }
+
 // Implements a heuristic for selecting shard layout based on how many Tensix cores are available
 // for each shard.
 static TensorMemoryLayout select_shard_spec(
@@ -410,6 +414,20 @@ static TensorMemoryLayout select_shard_spec(
     uint32_t input_width,
     ShardOrientation shard_orientation,
     const CoreCoord& compute_grid_size) {
+    auto get_conv2d_resources_for_shard_scheme = [&](TensorMemoryLayout shard_layout) -> std::tuple<uint32_t, uint32_t> {
+        auto pconfig = determine_parallel_config(
+            shard_layout,
+            batch_size,
+            in_channels,
+            output_height,
+            output_width,
+            out_channels,
+            compute_grid_size,
+            shard_orientation,
+            is_mm_conv);
+
+        return {0, 0};
+    };
     auto get_core_count_for_sharding = [&](TensorMemoryLayout shard_layout) {
         return determine_parallel_config(
             shard_layout,
@@ -875,6 +893,297 @@ void adjust_conv_op_config_for_auto_shard_if_necessary(
     }
 }
+std::pair<uint32_t, uint32_t> conv2d::estimate_L1_usage(
+    tt::ARCH arch, TensorMemoryLayout shard_layout,
+    const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype,
+    const SlidingWindowConfig& sliding_window_config, const DeviceComputeKernelConfig& compute_kernel_config,
+    const OptimizedConvBlockConfig& block_config, const OptimizedConvParallelizationConfig& pconfig,
+    const Shape& input_shape, const Shape& weights_shape,
+    uint32_t output_channels, uint32_t groups,
+    const Conv2dConfig& conv_config, bool enable_bias, bool use_non_tile_height
+    ) {
+
+    bool untilize_out = conv_config.output_layout == Layout::ROW_MAJOR;
+    auto output_shape = sliding_window_config.get_output_shape();
+    uint32_t batch_size = output_shape[0];
+    uint32_t conv_output_h = output_shape[1];
+    uint32_t conv_output_w = output_shape[2];
+
+    auto filter_hw = sliding_window_config.window_hw;
+    uint32_t input_channels = input_shape[3];
+    bool is_depthwise_conv = groups == input_channels && groups == output_channels;
+
+    uint32_t input_tile_size = tt::tile_size(datatype_to_dataformat_converter(input_dtype));
+    uint32_t weights_tile_size = tt::tile_size(datatype_to_dataformat_converter(weights_dtype));
+    uint32_t bias_tile_size = 0;
+    if(enable_bias) {
+        bias_tile_size = tt::tile_size(datatype_to_dataformat_converter(weights_dtype));
+    }
+    uint32_t output_tile_size = tt::tile_size(datatype_to_dataformat_converter(output_dtype));
+
+    auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] =
+        get_compute_kernel_config_args(arch, compute_kernel_config);
+
+    uint32_t act_block_w_ntiles = block_config.act_block_w_ntiles;
+    uint32_t act_block_h_ntiles = block_config.act_block_h_ntiles;
+    uint32_t act_block_num_tiles = block_config.act_block_h_ntiles * act_block_w_ntiles;
+
+
+    uint32_t weight_matrix_height = weights_shape.padded_shape()[2];
+    uint32_t weight_matrix_width = weights_shape.padded_shape()[3];
+    uint32_t weight_matrix_height_ntiles = weight_matrix_height / tt::constants::TILE_HEIGHT;
+    uint32_t weight_matrix_width_ntiles = weight_matrix_width / tt::constants::TILE_WIDTH;
+
+    uint32_t per_core_out_matrix_width_ntiles = tt::div_up(pconfig.per_core_out_matrix_width, tt::constants::TILE_WIDTH);
+    uint32_t per_core_out_matrix_height_ntiles = tt::div_up(pconfig.per_core_out_matrix_height, tt::constants::TILE_HEIGHT);
+
+    uint32_t num_blocks_act_h_per_core =
+        (per_core_out_matrix_height_ntiles + act_block_h_ntiles - 1) / act_block_h_ntiles;
+    uint32_t out_block_h_ntiles_padded = num_blocks_act_h_per_core * act_block_h_ntiles;
+
+    if(shard_layout == TensorMemoryLayout::WIDTH_SHARDED)
+    {
+
+        uint32_t conv_output_c_per_core = per_core_out_matrix_width_ntiles * tt::constants::TILE_WIDTH;
+
+        uint32_t output_size_per_core_in_bytes = per_core_out_matrix_width_ntiles *
+                                                 per_core_out_matrix_height_ntiles *
+                                                 tt::tile_size(datatype_to_dataformat_converter(output_dtype));
+
+        uint32_t act_block_num_bytes = act_block_num_tiles * input_tile_size;
+        uint32_t tilized_act_block_num_bytes = act_block_num_tiles * output_tile_size;
+
+
+        uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles;
+        uint32_t weight_block_num_tiles = weight_block_w_ntiles * act_block_w_ntiles; //act_block_w_ntiles == weight_block_h_ntiles
+        uint32_t weight_block_num_bytes = weight_block_num_tiles * weights_tile_size;
+
+        uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size;
+
+        uint32_t out_block_num_tiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles;
+
+        uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles;
+
+        packer_l1_acc = packer_l1_acc && ((enable_bias && num_blocks_act_w > 1) || (num_blocks_act_w > 2));
+
+        auto interm_dtype =
+            packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype;
+
+        uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype));
+
+        uint32_t partials_block_num_bytes = out_block_num_tiles * partial_tile_size;
+
+
+        //CB 0
+        uint32_t cb0_size = tilized_act_block_num_bytes; tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size);
+
+        //CB 1
+        uint32_t cb1_size = weight_block_num_bytes; tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size);
+
+        //CB 2
+        uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size);
+
+        //CB 5
+        uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size);
+
+        //CB 6
+        uint32_t cb6_size = act_block_num_bytes; tt::log_debug(tt::LogOp, "CB6 Size: {}", cb6_size);
+
+        //CB 24
+        uint32_t cb24_size = partials_block_num_bytes;
+        if(interm_dtype == output_dtype) {
+            cb24_size = 0;
+        } else {
+            tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size);
+        }
+        //CB 25
+        uint32_t cb25_size = tilized_act_block_num_bytes; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size);
+
+        uint32_t total_CB_size = cb0_size + cb1_size + cb2_size + cb5_size + cb6_size + cb24_size + cb25_size;
+
+        tt::log_debug(tt::LogOp, "Total CB Size: {}", total_CB_size);
+
+        return {output_size_per_core_in_bytes, total_CB_size};
+    } else if (shard_layout == TensorMemoryLayout::HEIGHT_SHARDED) {
+        uint32_t output_size = 0;
+        if(use_non_tile_height){
+            uint32_t total_height = conv_output_h * conv_output_w * batch_size;
+            output_size = total_height / pconfig.num_cores_nhw * output_channels;
+        } else {
+            output_size = per_core_out_matrix_height_ntiles *
+                          per_core_out_matrix_width_ntiles *
+                          output_tile_size;
+        }
+
+        uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size;
+
+        uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles;
+
+        TT_FATAL(conv_act_c_blocks == 1, "Error: conv_act_c_blocks should be 1 for height sharding");
+        uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles;
+        uint32_t weight_block_h_ntiles = is_depthwise_conv ? act_block_h_ntiles : act_block_w_ntiles;
+
+
+        uint32_t act_block_cb_ntiles = act_block_h_ntiles * act_block_w_ntiles;
+
+        uint32_t act_block_cb_size = act_block_cb_ntiles * input_tile_size;
+        uint32_t tilized_act_cb_size = act_block_cb_ntiles * output_tile_size;
+
+        uint32_t output_block_ntiles = out_block_h_ntiles_padded * per_core_out_matrix_width_ntiles;
+
+        uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles;
+        uint32_t num_blocks_act_h = per_core_out_matrix_height_ntiles / act_block_h_ntiles;
+        uint32_t in0_num_blocks_w =
+            num_blocks_act_w * conv_act_c_blocks; // Fold outer c_block loop together with weight_block_num_tiles = 9
+
+        packer_l1_acc = packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
+
+        auto interm_dtype =
+            packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype;
+
+        uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype));
+
+        uint32_t act_block_split_last_ntiles = 0;
+        uint32_t act_block_split_ntiles = act_block_cb_ntiles;
+        if(conv_config.enable_split_reader) {
+            uint32_t act_block_h_nsubblocks = block_config.act_block_h_ntiles / block_config.out_subblock_h_ntiles;
+            act_block_split_last_ntiles = act_block_cb_ntiles / 2;
+            act_block_split_ntiles = act_block_cb_ntiles - act_block_split_last_ntiles;
+        }
+
+        if(conv_config.enable_act_double_buffer) {
+            act_block_split_last_ntiles *= 2;
+            act_block_split_ntiles *= 2;
+        }
+        //CB 0
+        uint32_t cb0_size = act_block_split_ntiles * input_tile_size; tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size);
+
+        //CB 1
+        uint32_t cb1_size = weight_block_h_ntiles * weight_block_w_ntiles * weights_tile_size;
+        if(num_blocks_act_h > 1) {
+            cb1_size *= filter_hw.first;
+        }
+        if(conv_config.enable_weights_double_buffer) {
+            cb1_size *= 2;
+        }
+        tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size);
+
+
+        //CB 2
+        uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size);
+
+        uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size);
+
+        uint32_t cb7_size = 0;
+
+        cb7_size = act_block_split_last_ntiles * input_tile_size; tt::log_debug(tt::LogOp, "CB7 Size: {}", cb7_size);
+
+        //CB 24
+        uint32_t cb24_size = output_block_ntiles * partial_tile_size;
+        if(untilize_out==false && interm_dtype == output_dtype) {
+            cb24_size = 0;
+        }
+        if(is_depthwise_conv) {
+            cb24_size = output_tile_size;
+        }
+        if(cb24_size != 0) {
+            tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size);
+        }
+        //CB 25
+        uint32_t cb25_size = tilized_act_cb_size; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size);
+
+
+        uint32_t cb26_size = 0;
+        //CB26
+        if(untilize_out) {
+            cb26_size = weight_block_w_ntiles * output_tile_size; tt::log_debug(tt::LogOp, "CB26 Size: {}", cb26_size);
+        }
+
+        uint32_t cb27_size = 0;
+        if(is_depthwise_conv) {
+            cb27_size = output_tile_size;
+        }
+
+        return {output_size, cb0_size + cb1_size + cb2_size + cb5_size + cb7_size + cb24_size + cb25_size + cb26_size + cb27_size};
+    } else if(shard_layout == TensorMemoryLayout::BLOCK_SHARDED) {
+        uint32_t output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
+
+        uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size;
+
+        uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles;
+
+        uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles;
+        uint32_t weight_block_h_ntiles = act_block_w_ntiles;
+
+
+        uint32_t tilized_act_block_cb_size = act_block_h_ntiles * act_block_w_ntiles * output_tile_size;
+        uint32_t row_major_act_cb_size = act_block_h_ntiles * act_block_w_ntiles * input_tile_size;
+
+        uint32_t output_block_ntiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles;
+
+        uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles;
+        uint32_t num_blocks_act_h = per_core_out_matrix_height_ntiles / act_block_h_ntiles;
+        uint32_t in0_num_blocks_w =
+            num_blocks_act_w * conv_act_c_blocks; // Fold outer c_block loop together with weight_block_num_tiles = 9
+
+        packer_l1_acc = packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
+
+        auto interm_dtype =
+            packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype;
+
+        uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype));
+
+
+
+
+        //CB 0
+        uint32_t cb0_size = tilized_act_block_cb_size;
+        if(conv_config.enable_act_double_buffer) {
+            cb0_size *= 2;
+        }
+        tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size);
+
+        //CB 1
+        uint32_t cb1_size = weight_block_h_ntiles * weight_block_w_ntiles * weights_tile_size;
+        if(conv_config.enable_weights_double_buffer) {cb1_size *= 2;}
+
+        tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size);
+
+
+        //CB 2
+        uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size);
+
+        //CB 5
+        uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size);
+
+        //CB 6
+        uint32_t cb6_size = row_major_act_cb_size; tt::log_debug(tt::LogOp, "CB6 Size: {}", cb6_size);
+
+        //CB 24
+        uint32_t cb24_size = output_block_ntiles * partial_tile_size;
+        if(untilize_out==false && interm_dtype == output_dtype) {
+            cb24_size = 0;
+        } else {
+            tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size);
+        }
+
+        //CB 25
+        uint32_t cb25_size = tilized_act_block_cb_size; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size);
+
+
+        uint32_t cb26_size = 0;
+        //CB26
+        if(untilize_out) {
+            cb26_size = weight_block_w_ntiles * output_tile_size; tt::log_debug(tt::LogOp, "CB26 Size: {}", cb26_size);
+        }
+
+        return{ output_size, cb0_size + cb1_size + cb2_size + cb5_size + cb6_size + cb24_size + cb25_size + cb26_size};
+    }
+    return {0, 0};
+
+}
+
+
 template
 std::tuple get_conv_padded_input_shape_and_mem_config(
     Device* device,
     const ttnn::Tensor& input_tensor_,
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp
index 349e3837329f..5432f2711dbe 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp
@@ -29,67 +29,6 @@ using OutputHeight = uint32_t;
 using OutputWidth = uint32_t;
 using Result = std::tuple>;
-struct Conv2dConfig {
-    DataType dtype = DataType::BFLOAT16;
-    DataType weights_dtype = DataType::BFLOAT16;
-    string activation = "";
-    uint32_t input_channels_alignment = 32;
-    bool deallocate_activation = false;
-    bool reallocate_halo_output = false;
-    uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED.
-    uint32_t act_block_w_div = 1; // Amount by which the maximum possible act_block_width is divided.
Max act_block_w = in_channels / (total_num_cores * TILE_WIDTH); - // Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED - bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true - bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true - std::optional shard_layout; - std::optional core_grid = std::nullopt; // used only if override_sharding_config is true - bool transpose_shards = true; // used only if override_sharding_config is true and if height sharding is false - Layout output_layout = Layout::TILE; - bool enable_act_double_buffer = false; - bool enable_weights_double_buffer = false; // Used on for block sharded convolutions - bool enable_split_reader = false; - bool enable_subblock_padding = false; - static constexpr auto attribute_names = std::make_tuple( - "dtype", - "weights_dtype", - "activation", - "input_channels_alignment", - "deallocate_activation", - "reallocate_halo_output", - "act_block_h_override", - "act_block_w_div", - "reshard_if_not_optimal", - "override_sharding_config", - "shard_layout", - "core_grid", - "transpose_shards", - "output_layout", - "enable_act_double_buffer", - "enable_weights_double_buffer", - "enable_split_reader", - "enable_subblock_padding"); - const auto attribute_values() const { - return std::make_tuple( - std::cref(this->dtype), - std::cref(this->weights_dtype), - std::cref(this->activation), - std::cref(this->input_channels_alignment), - std::cref(this->deallocate_activation), - std::cref(this->reallocate_halo_output), - std::cref(this->act_block_h_override), - std::cref(this->act_block_w_div), - std::cref(this->reshard_if_not_optimal), - std::cref(this->override_sharding_config), - std::cref(this->shard_layout), - std::cref(this->core_grid), - std::cref(this->transpose_shards), - std::cref(this->output_layout), - std::cref(this->enable_act_double_buffer), - std::cref(this->enable_weights_double_buffer), - std::cref(this->enable_split_reader), - std::cref(this->enable_subblock_padding)); - } -}; uint32_t find_closest_largest_divisor(uint32_t num, uint32_t start_divisor); @@ -162,8 +101,6 @@ std::tuple get_conv_padded_input_sh bool is_mm_conv, bool is_non_tile_mul_width=false); -OptimizedConvParallelizationConfig determine_conv_op_parallel_config_from_conv_output_mem_config( - const MemoryConfig& conv_output_mem_config, uint32_t num_cores_nhw, uint32_t num_cores_c); void adjust_conv_op_config_for_auto_shard_if_necessary( bool is_mm_conv, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index d2d80faa594a..a9c32d4b6eed 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -76,7 +76,6 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optionalget_memory_allocation_statistics(tt::tt_metal::BufferType::L1); - tt::log_info(tt::LogOp, "Allocation Stats before Op: {}", stats); std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({a, b}))}; operation::launch_op( [sliding_window_config, output_channels, groups, untilize_out, fuse_relu, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height, stats] @@ -267,11 +266,25 @@ operation::ProgramWithCallbacks 
OptimizedConvNew::create_program(const std::vect use_non_tile_height); if(std::getenv("TT_DISABLE_CONV_L1_CHECK")==nullptr) { + tt::log_info(tt::LogOp, "Allocation Stats before Op: {}", this->pre_op_l1_allocation_size_bytes); program_with_cbs.program.set_pre_exec_callback([this, arch, input_dtype, weights_dtype, output_dtype, weights_shape, has_bias, device](const Program& program) { const uint32_t post_op_l1_stats = device->get_memory_allocation_statistics(tt::tt_metal::BufferType::L1).total_allocated_bytes; auto actual_cb_size = program.get_max_cb_memory_usage(device); - auto [calc_output_size, calc_CB_size] = this->estimate_L1_usage(arch, input_dtype, weights_dtype, output_dtype, weights_shape, has_bias); + auto [calc_output_size, calc_CB_size] = estimate_L1_usage( + arch, this->memory_config.memory_layout, + input_dtype, weights_dtype, output_dtype, + sliding_window_config, compute_kernel_config, + block_config, parallelization_config, + input_tensor_shape, weights_shape, + output_channels, groups, + Conv2dConfig{ + .enable_act_double_buffer=enable_act_double_buffer, + .enable_weights_double_buffer=enable_weights_double_buffer, + .enable_split_reader=enable_split_reader, + .enable_subblock_padding=enable_subblock_padding + }, + has_bias, use_non_tile_height); if(calc_CB_size > 0) { if(calc_CB_size != actual_cb_size) { tt::log_error("Calculated CB size {} does not match with the actual CB size {}",calc_CB_size,actual_cb_size); @@ -288,285 +301,6 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program(const std::vect return program_with_cbs; } -std::pair OptimizedConvNew::estimate_L1_usage(tt::ARCH arch, const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype, const tt::tt_metal::LegacyShape weights_shape, bool enable_bias) const { - - auto output_shape = sliding_window_config.get_output_shape(); - uint32_t batch_size = output_shape[0]; - uint32_t conv_output_h = output_shape[1]; - uint32_t conv_output_w = output_shape[2]; - - auto filter_hw = sliding_window_config.window_hw; - uint32_t input_channels = this->input_tensor_shape[3]; - bool is_depthwise_conv = this->groups == input_channels && groups == output_channels; - - uint32_t input_tile_size = tt::tile_size(datatype_to_dataformat_converter(input_dtype)); - uint32_t weights_tile_size = tt::tile_size(datatype_to_dataformat_converter(weights_dtype)); - uint32_t bias_tile_size = 0; - if(enable_bias) { - bias_tile_size = tt::tile_size(datatype_to_dataformat_converter(weights_dtype)); - } - uint32_t output_tile_size = tt::tile_size(datatype_to_dataformat_converter(output_dtype)); - - auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] = - get_compute_kernel_config_args(arch, compute_kernel_config); - - uint32_t act_block_w_ntiles = this->block_config.act_block_w_ntiles; - uint32_t act_block_h_ntiles = this->block_config.act_block_h_ntiles; - uint32_t act_block_num_tiles = this->block_config.act_block_h_ntiles * act_block_w_ntiles; - - - uint32_t weight_matrix_height = weights_shape[2]; - uint32_t weight_matrix_width = weights_shape[3]; - uint32_t weight_matrix_height_ntiles = weight_matrix_height / TILE_HEIGHT; - uint32_t weight_matrix_width_ntiles = weight_matrix_width / TILE_WIDTH; - - uint32_t per_core_out_matrix_width_ntiles = tt::div_up(this->parallelization_config.per_core_out_matrix_width, TILE_WIDTH); - uint32_t per_core_out_matrix_height_ntiles = tt::div_up(this->parallelization_config.per_core_out_matrix_height, TILE_HEIGHT); - - uint32_t 
num_blocks_act_h_per_core = - (per_core_out_matrix_height_ntiles + act_block_h_ntiles - 1) / act_block_h_ntiles; - uint32_t out_block_h_ntiles_padded = num_blocks_act_h_per_core * act_block_h_ntiles; - - if(this->memory_config.memory_layout == TensorMemoryLayout::WIDTH_SHARDED) - { - - uint32_t conv_output_c_per_core = per_core_out_matrix_width_ntiles * TILE_WIDTH; - - uint32_t output_size_per_core_in_bytes = per_core_out_matrix_width_ntiles * per_core_out_matrix_height_ntiles * tt::tile_size(datatype_to_dataformat_converter(this->dtype)); - - uint32_t act_block_num_bytes = act_block_num_tiles * input_tile_size; - uint32_t tilized_act_block_num_bytes = act_block_num_tiles * output_tile_size; - - - uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles; - uint32_t weight_block_num_tiles = weight_block_w_ntiles * act_block_w_ntiles; //act_block_w_ntiles == weight_block_h_ntiles - uint32_t weight_block_num_bytes = weight_block_num_tiles * weights_tile_size; - - uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size; - - uint32_t out_block_num_tiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles; - - uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles; - - packer_l1_acc = packer_l1_acc && ((has_bias && num_blocks_act_w > 1) || (num_blocks_act_w > 2)); - - auto interm_dtype = - packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype; - - uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype)); - - uint32_t partials_block_num_bytes = out_block_num_tiles * partial_tile_size; - - - //CB 0 - uint32_t cb0_size = tilized_act_block_num_bytes; tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size); - - //CB 1 - uint32_t cb1_size = weight_block_num_bytes; tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size); - - //CB 2 - uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size); - - //CB 5 - uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size); - - //CB 6 - uint32_t cb6_size = act_block_num_bytes; tt::log_debug(tt::LogOp, "CB6 Size: {}", cb6_size); - - //CB 24 - uint32_t cb24_size = partials_block_num_bytes; - if(interm_dtype == output_dtype) { - cb24_size = 0; - } else { - tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size); - } - //CB 25 - uint32_t cb25_size = tilized_act_block_num_bytes; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size); - - uint32_t total_CB_size = cb0_size + cb1_size + cb2_size + cb5_size + cb6_size + cb24_size + cb25_size; - - tt::log_debug(tt::LogOp, "Total CB Size: {}", total_CB_size); - - return {output_size_per_core_in_bytes, total_CB_size}; - } else if (this->memory_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) { - uint32_t output_size = 0; - if(this->use_non_tile_height){ - uint32_t total_height = conv_output_h * conv_output_w * batch_size; - output_size = total_height / this->parallelization_config.num_cores_nhw * this->output_channels; - } else { - output_size = per_core_out_matrix_height_ntiles * - per_core_out_matrix_width_ntiles * - output_tile_size; - } - - uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size; - - uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles; - - TT_FATAL(conv_act_c_blocks == 1, "Error: conv_act_c_blocks should be 1 for height sharding"); - uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles; - uint32_t weight_block_h_ntiles = 
is_depthwise_conv ? act_block_h_ntiles : act_block_w_ntiles; - - - uint32_t act_block_cb_ntiles = act_block_h_ntiles * act_block_w_ntiles; - - uint32_t act_block_cb_size = act_block_cb_ntiles * input_tile_size; - uint32_t tilzed_act_cb_size = act_block_cb_ntiles * output_tile_size; - - uint32_t output_block_ntiles = out_block_h_ntiles_padded * per_core_out_matrix_width_ntiles; - - uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles; - uint32_t num_blocks_act_h = per_core_out_matrix_height_ntiles / act_block_h_ntiles; - uint32_t in0_num_blocks_w = - num_blocks_act_w * conv_act_c_blocks; // Fold outer c_block loop together with weight_block_num_tiles = 9 - - packer_l1_acc = packer_l1_acc && ((has_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2)); - - auto interm_dtype = - packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype; - - uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype)); - - uint32_t act_block_split_last_ntiles = 0; - uint32_t act_block_split_ntiles = act_block_cb_ntiles ; - if(this->enable_split_reader) { - uint32_t act_block_h_nsubblocks = block_config.act_block_h_ntiles / block_config.out_subblock_h_ntiles; - uint32_t act_block_split_last_ntiles = act_block_cb_ntiles / 2; - uint32_t act_block_split_ntiles = act_block_cb_ntiles - act_block_split_last_ntiles; - } - - if(this->enable_act_double_buffer) { - act_block_split_last_ntiles *= 2; - act_block_split_ntiles *= 2; - } - //CB 0 - uint32_t cb0_size = act_block_split_ntiles * input_tile_size; tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size); - - //CB 1 - uint32_t cb1_size = weight_block_h_ntiles * weight_block_w_ntiles * weights_tile_size; - if(num_blocks_act_h > 1) { - cb1_size *= filter_hw.first; - } - if(this->enable_weights_double_buffer) { - cb1_size *= 2; - } - tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size); - - - //CB 2 - uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size); - - uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size); - - uint32_t cb7_size = 0; - - cb7_size = act_block_split_last_ntiles * input_tile_size; tt::log_debug(tt::LogOp, "CB7 Size: {}", cb7_size); - - //CB 24 - uint32_t cb24_size = output_block_ntiles * partial_tile_size; - if(this->untilize_out==false && interm_dtype == output_dtype) { - cb24_size = 0; - } - if(is_depthwise_conv) { - cb24_size = output_tile_size; - } - if(cb24_size == 0) { - tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size); - } - //CB 25 - uint32_t cb25_size = tilzed_act_cb_size; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size); - - - uint32_t cb26_size = 0; - //CB26 - if(this->untilize_out) { - cb26_size = weight_block_w_ntiles * output_tile_size; tt::log_debug(tt::LogOp, "CB26 Size: {}", cb26_size); - } - - uint32_t cb27_size = 0; - if(is_depthwise_conv) { - cb27_size = output_tile_size; - } - - return {output_size, cb0_size + cb1_size + cb2_size + cb5_size + cb7_size + cb24_size + cb25_size + cb26_size + cb27_size}; - } else if(this->memory_config.memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { - uint32_t output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size; - - uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size; - - uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles; - - uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles; - uint32_t weight_block_h_ntiles = 
act_block_w_ntiles; - - - uint32_t tilized_act_block_cb_size = act_block_h_ntiles * act_block_w_ntiles * output_tile_size ; - uint32_t row_major_act_cb_size = act_block_h_ntiles * act_block_w_ntiles * input_tile_size ; - - uint32_t output_block_ntiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles; - - uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles; - uint32_t num_blocks_act_h = per_core_out_matrix_height_ntiles / act_block_h_ntiles; - uint32_t in0_num_blocks_w = - num_blocks_act_w * conv_act_c_blocks; // Fold outer c_block loop together with weight_block_num_tiles = 9 - - packer_l1_acc = packer_l1_acc && ((has_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2)); - - auto interm_dtype = - packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype; - - uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype)); - - - - - //CB 0 - uint32_t cb0_size = tilized_act_block_cb_size; - if(this->enable_act_double_buffer) { - cb0_size *= 2; - } - tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size); - - //CB 1 - uint32_t cb1_size = weight_block_h_ntiles * weight_block_w_ntiles * weights_tile_size; - if(this->enable_weights_double_buffer) {cb1_size *= 2;} - - tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size); - - - //CB 2 - uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size); - - //CB 5 - uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size); - - //CB 6 - uint32_t cb6_size = row_major_act_cb_size; tt::log_debug(tt::LogOp, "CB6 Size: {}", cb6_size); - - //CB 24 - uint32_t cb24_size = output_block_ntiles * partial_tile_size; - if(this->untilize_out==false && interm_dtype == output_dtype) { - cb24_size = 0; - } else { - tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size); - } - - //CB 25 - uint32_t cb25_size = tilized_act_block_cb_size; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size); - - - uint32_t cb26_size = 0; - //CB26 - if(this->untilize_out) { - cb26_size = weight_block_w_ntiles * output_tile_size; tt::log_debug(tt::LogOp, "CB26 Size: {}", cb26_size); - } - - return{ output_size, cb0_size + cb1_size + cb2_size + cb5_size + cb6_size + cb24_size + cb25_size + cb26_size}; - } - return {0, 0}; - -} - operation::OpPerformanceModel OptimizedConvNew::create_op_performance_model(const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector &output_tensors) const { const auto& input_tensor_a_shape = this->input_tensor_shape; uint32_t batch_size = input_tensor_a_shape[0]; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index 14ce7a09c6c6..d405a6736cf1 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -15,6 +15,69 @@ namespace ttnn { namespace operations::conv { namespace conv2d { +struct Conv2dConfig { + DataType dtype = DataType::BFLOAT16; + DataType weights_dtype = DataType::BFLOAT16; + string activation = ""; + uint32_t input_channels_alignment = 32; + bool deallocate_activation = false; + bool reallocate_halo_output = false; + uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED. + uint32_t act_block_w_div = 1; // Amount by which the maximum possible act_block_width is divided. 
Max act_block_w = in_channels / (total_num_cores * TILE_WIDTH); + // Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED + bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true + bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true + std::optional shard_layout; + std::optional core_grid = std::nullopt; // used only if override_sharding_config is true + bool transpose_shards = true; // used only if override_sharding_config is true and if height sharding is false + Layout output_layout = Layout::TILE; + bool enable_act_double_buffer = false; + bool enable_weights_double_buffer = false; // Used on for block sharded convolutions + bool enable_split_reader = false; + bool enable_subblock_padding = false; + static constexpr auto attribute_names = std::make_tuple( + "dtype", + "weights_dtype", + "activation", + "input_channels_alignment", + "deallocate_activation", + "reallocate_halo_output", + "act_block_h_override", + "act_block_w_div", + "reshard_if_not_optimal", + "override_sharding_config", + "shard_layout", + "core_grid", + "transpose_shards", + "output_layout", + "enable_act_double_buffer", + "enable_weights_double_buffer", + "enable_split_reader", + "enable_subblock_padding"); + const auto attribute_values() const { + return std::make_tuple( + std::cref(this->dtype), + std::cref(this->weights_dtype), + std::cref(this->activation), + std::cref(this->input_channels_alignment), + std::cref(this->deallocate_activation), + std::cref(this->reallocate_halo_output), + std::cref(this->act_block_h_override), + std::cref(this->act_block_w_div), + std::cref(this->reshard_if_not_optimal), + std::cref(this->override_sharding_config), + std::cref(this->shard_layout), + std::cref(this->core_grid), + std::cref(this->transpose_shards), + std::cref(this->output_layout), + std::cref(this->enable_act_double_buffer), + std::cref(this->enable_weights_double_buffer), + std::cref(this->enable_split_reader), + std::cref(this->enable_subblock_padding)); + } +}; + + // TODO: Accept parallelization enum class OptimizedConvOpParallelizationStrategy { MULTI_CORE, MULTI_CORE_REUSE, MULTI_CORE_REUSE_MCAST, SINGLE_CORE @@ -109,8 +172,6 @@ struct OptimizedConvNew { enable_subblock_padding(enable_subblock_padding), use_non_tile_height(use_non_tile_height) {} - std::pair estimate_L1_usage(tt::ARCH arch, const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype, const tt::tt_metal::LegacyShape weights_shape, bool enable_bias) const; - void validate(const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; std::vector compute_output_shapes(const std::vector& input_tensors) const; std::vector create_output_tensors(const std::vector& input_tensors) const; @@ -171,6 +232,16 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional estimate_L1_usage( + tt::ARCH arch, TensorMemoryLayout shard_layout, + const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype, + const sliding_window::SlidingWindowConfig& sliding_window_config, const DeviceComputeKernelConfig& compute_kernel_config, + const OptimizedConvBlockConfig& block_config, const OptimizedConvParallelizationConfig& pconfig, + const Shape& input_shape, const Shape& weights_shape, + uint32_t output_channels, uint32_t groups, + const Conv2dConfig& conv_config, bool enable_bias, bool use_non_tile_height + ); + } // namespace conv2d } // namespace operations::conv
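Reviewer note: below is a minimal sketch (not part of the diff) of how the relocated free function conv2d::estimate_L1_usage can be called from op code, mirroring the pre-exec check added in conv2d_op.cpp above. It assumes the same variables that are in scope inside OptimizedConvNew::create_program (arch, the input/weights/output dtypes, sliding_window_config, compute_kernel_config, block_config, parallelization_config, input_tensor_shape, weights_shape, output_channels, groups, the double-buffer/split-reader flags, has_bias, use_non_tile_height); l1_budget_bytes is a hypothetical caller-supplied limit introduced only for illustration.

    // Sketch only: estimate per-core L1 needs before building the conv program.
    // All names below are assumed to be in scope as in OptimizedConvNew::create_program.
    auto [estimated_output_bytes, estimated_cb_bytes] = estimate_L1_usage(
        arch, this->memory_config.memory_layout,
        input_dtype, weights_dtype, output_dtype,
        sliding_window_config, compute_kernel_config,
        block_config, parallelization_config,
        input_tensor_shape, weights_shape,
        output_channels, groups,
        Conv2dConfig{
            .enable_act_double_buffer = enable_act_double_buffer,
            .enable_weights_double_buffer = enable_weights_double_buffer,
            .enable_split_reader = enable_split_reader,
            .enable_subblock_padding = enable_subblock_padding},
        has_bias, use_non_tile_height);

    // l1_budget_bytes is a hypothetical limit supplied by the caller, not part of this change.
    if (estimated_cb_bytes + estimated_output_bytes > l1_budget_bytes) {
        tt::log_info(tt::LogOp, "Conv2d may exceed L1: CBs {} B + output {} B > budget {} B",
            estimated_cb_bytes, estimated_output_bytes, l1_budget_bytes);
    }

The function returns {per-core output buffer bytes, total circular-buffer bytes} and falls back to {0, 0} for layouts it does not model, so callers should treat a zero CB estimate as "no estimate available" rather than "fits in L1", as the guard in conv2d_op.cpp (calc_CB_size > 0) already does.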