Commit 73a0fe8

#0: Refactor estimate_L1

sankarmanoj-tt committed Dec 11, 2024
1 parent 3f52fd5 commit 73a0fe8

Showing 4 changed files with 397 additions and 346 deletions.
309 changes: 309 additions & 0 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -5,17 +5,20 @@
#include <sys/types.h>
#include <cstdint>
#include <optional>
#include <tuple>

#include "conv2d_utils.hpp"
#include "common/constants.hpp"
#include "impl/buffers/buffer_constants.hpp"
#include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp"
#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
#include "ttnn/operations/core/core.hpp"
#include "ttnn/operations/pool/downsample/device/downsample_op.hpp"
#include "tt_metal/detail/reports/memory_reporter.hpp"
#include "tt_metal/common/work_split.hpp"
#include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp"
#include "ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp"
#include "ttnn/operations/sliding_window/sliding_window.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "tt_metal/common/core_coord.hpp"

@@ -397,6 +400,7 @@ bool use_matmul_for_1x1_conv(
padding[1] == 0 && dilation[0] == 1 && dilation[1] == 1 && groups == 1;
}


// Implements a heuristic for selecting shard layout based on how many Tensix cores are available
// for each shard.
static TensorMemoryLayout select_shard_spec(
@@ -410,6 +414,20 @@ static TensorMemoryLayout select_shard_spec(
uint32_t input_width,
ShardOrientation shard_orientation,
const CoreCoord& compute_grid_size) {
auto get_conv2d_resources_for_shard_scheme = [&](TensorMemoryLayout shard_layout) -> std::tuple<uint32_t, uint32_t> {
// NOTE: the parallel config is computed here but not yet consumed;
// this lambda currently returns a placeholder {0, 0} estimate.
auto pconfig = determine_parallel_config(
shard_layout,
batch_size,
in_channels,
output_height,
output_width,
out_channels,
compute_grid_size,
shard_orientation,
is_mm_conv);
(void)pconfig;  // suppress unused-variable warning until the estimate is wired in

return {0, 0};
};
auto get_core_count_for_sharding = [&](TensorMemoryLayout shard_layout) {
return determine_parallel_config(
shard_layout,
@@ -875,6 +893,297 @@ void adjust_conv_op_config_for_auto_shard_if_necessary(
}
}

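// Estimates the L1 footprint of a conv2d op for the given shard layout.
// Returns {per-core output buffer size in bytes, total circular-buffer (CB) size in bytes};
// unsupported layouts return {0, 0}.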
std::pair<uint32_t,uint32_t> conv2d::estimate_L1_usage(
tt::ARCH arch, TensorMemoryLayout shard_layout,
const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype,
const SlidingWindowConfig& sliding_window_config, const DeviceComputeKernelConfig& compute_kernel_config,
const OptimizedConvBlockConfig& block_config, const OptimizedConvParallelizationConfig& pconfig,
const Shape& input_shape, const Shape& weights_shape,
uint32_t output_channels, uint32_t groups,
const Conv2dConfig& conv_config, bool enable_bias, bool use_non_tile_height
) {

bool untilize_out = conv_config.output_layout == Layout::ROW_MAJOR;
auto output_shape = sliding_window_config.get_output_shape();
uint32_t batch_size = output_shape[0];
uint32_t conv_output_h = output_shape[1];
uint32_t conv_output_w = output_shape[2];

auto filter_hw = sliding_window_config.window_hw;
uint32_t input_channels = input_shape[3];
bool is_depthwise_conv = groups == input_channels && groups == output_channels;

uint32_t input_tile_size = tt::tile_size(datatype_to_dataformat_converter(input_dtype));
uint32_t weights_tile_size = tt::tile_size(datatype_to_dataformat_converter(weights_dtype));
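// Bias tiles use the weights data format.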
uint32_t bias_tile_size = 0;
if(enable_bias) {
bias_tile_size = tt::tile_size(datatype_to_dataformat_converter(weights_dtype));
}
uint32_t output_tile_size = tt::tile_size(datatype_to_dataformat_converter(output_dtype));

auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] =
get_compute_kernel_config_args(arch, compute_kernel_config);
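// fp32_dest_acc_en and packer_l1_acc determine the data format of the matmul partials below,
// which in turn decides whether a separate partials CB needs to be counted.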

uint32_t act_block_w_ntiles = block_config.act_block_w_ntiles;
uint32_t act_block_h_ntiles = block_config.act_block_h_ntiles;
uint32_t act_block_num_tiles = block_config.act_block_h_ntiles * act_block_w_ntiles;


uint32_t weight_matrix_height = weights_shape.padded_shape()[2];
uint32_t weight_matrix_width = weights_shape.padded_shape()[3];
uint32_t weight_matrix_height_ntiles = weight_matrix_height / tt::constants::TILE_HEIGHT;
uint32_t weight_matrix_width_ntiles = weight_matrix_width / tt::constants::TILE_WIDTH;

uint32_t per_core_out_matrix_width_ntiles = tt::div_up(pconfig.per_core_out_matrix_width, tt::constants::TILE_WIDTH);
uint32_t per_core_out_matrix_height_ntiles = tt::div_up(pconfig.per_core_out_matrix_height, tt::constants::TILE_HEIGHT);

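// Pad the per-core output height up to a whole number of activation blocks.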
uint32_t num_blocks_act_h_per_core =
(per_core_out_matrix_height_ntiles + act_block_h_ntiles - 1) / act_block_h_ntiles;
uint32_t out_block_h_ntiles_padded = num_blocks_act_h_per_core * act_block_h_ntiles;

if(shard_layout == TensorMemoryLayout::WIDTH_SHARDED)
{

uint32_t conv_output_c_per_core = per_core_out_matrix_width_ntiles * tt::constants::TILE_WIDTH;

uint32_t output_size_per_core_in_bytes = per_core_out_matrix_width_ntiles *
per_core_out_matrix_height_ntiles *
tt::tile_size(datatype_to_dataformat_converter(output_dtype));

uint32_t act_block_num_bytes = act_block_num_tiles * input_tile_size;
uint32_t tilized_act_block_num_bytes = act_block_num_tiles * output_tile_size;


uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles;
uint32_t weight_block_num_tiles = weight_block_w_ntiles * act_block_w_ntiles; //act_block_w_ntiles == weight_block_h_ntiles
uint32_t weight_block_num_bytes = weight_block_num_tiles * weights_tile_size;

uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size;

uint32_t out_block_num_tiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles;

uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles;

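// Packer L1 accumulation is only kept when there are enough inner-dim blocks to accumulate over:
// more than one with a fused bias, more than two without.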
packer_l1_acc = packer_l1_acc && ((enable_bias && num_blocks_act_w > 1) || (num_blocks_act_w > 2));

auto interm_dtype =
packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype;

uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype));

uint32_t partials_block_num_bytes = out_block_num_tiles * partial_tile_size;


//CB 0
uint32_t cb0_size = tilized_act_block_num_bytes; tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size);

//CB 1
uint32_t cb1_size = weight_block_num_bytes; tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size);

//CB 2
uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size);

//CB 5
uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size);

//CB 6
uint32_t cb6_size = act_block_num_bytes; tt::log_debug(tt::LogOp, "CB6 Size: {}", cb6_size);

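// CB 24 holds the matmul partials; no extra buffer is counted when the intermediate
// format matches the output format.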
//CB 24
uint32_t cb24_size = partials_block_num_bytes;
if(interm_dtype == output_dtype) {
cb24_size = 0;
} else {
tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size);
}
//CB 25
uint32_t cb25_size = tilized_act_block_num_bytes; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size);

uint32_t total_CB_size = cb0_size + cb1_size + cb2_size + cb5_size + cb6_size + cb24_size + cb25_size;

tt::log_debug(tt::LogOp, "Total CB Size: {}", total_CB_size);

return {output_size_per_core_in_bytes, total_CB_size};
} else if (shard_layout == TensorMemoryLayout::HEIGHT_SHARDED) {
uint32_t output_size = 0;
if(use_non_tile_height){
uint32_t total_height = conv_output_h * conv_output_w * batch_size;
output_size = total_height / pconfig.num_cores_nhw * output_channels;
} else {
output_size = per_core_out_matrix_height_ntiles *
per_core_out_matrix_width_ntiles *
output_tile_size;
}

uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size;

uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles;

TT_FATAL(conv_act_c_blocks == 1, "Error: conv_act_c_blocks should be 1 for height sharding");
uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles;
uint32_t weight_block_h_ntiles = is_depthwise_conv ? act_block_h_ntiles : act_block_w_ntiles;


uint32_t act_block_cb_ntiles = act_block_h_ntiles * act_block_w_ntiles;

uint32_t act_block_cb_size = act_block_cb_ntiles * input_tile_size;
uint32_t tilized_act_cb_size = act_block_cb_ntiles * output_tile_size;

uint32_t output_block_ntiles = out_block_h_ntiles_padded * per_core_out_matrix_width_ntiles;

uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles;
uint32_t num_blocks_act_h = per_core_out_matrix_height_ntiles / act_block_h_ntiles;
uint32_t in0_num_blocks_w =
num_blocks_act_w * conv_act_c_blocks; // Fold outer c_block loop together with weight_block_num_tiles = 9

packer_l1_acc = packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));

auto interm_dtype =
packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype;

uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype));

uint32_t act_block_split_last_ntiles = 0;
uint32_t act_block_split_ntiles = act_block_cb_ntiles;
if(conv_config.enable_split_reader) {
// Split the activation block between the two readers; the second reader takes the lower half.
act_block_split_last_ntiles = act_block_cb_ntiles / 2;
act_block_split_ntiles = act_block_cb_ntiles - act_block_split_last_ntiles;
}

if(conv_config.enable_act_double_buffer) {
act_block_split_last_ntiles *= 2;
act_block_split_ntiles *= 2;
}
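// CB 0 carries the activation block for the main reader; CB 7 carries the second reader's
// share and stays empty when the split reader is disabled.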
//CB 0
uint32_t cb0_size = act_block_split_ntiles * input_tile_size; tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size);

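// The weight CB is sized for window_h weight blocks when the activation spans multiple
// height blocks, and doubled when weights double buffering is enabled.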
//CB 1
uint32_t cb1_size = weight_block_h_ntiles * weight_block_w_ntiles * weights_tile_size;
if(num_blocks_act_h > 1) {
cb1_size *= filter_hw.first;
}
if(conv_config.enable_weights_double_buffer) {
cb1_size *= 2;
}
tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size);


//CB 2
uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size);

uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size);

//CB 7
uint32_t cb7_size = act_block_split_last_ntiles * input_tile_size; tt::log_debug(tt::LogOp, "CB7 Size: {}", cb7_size);

//CB 24
uint32_t cb24_size = output_block_ntiles * partial_tile_size;
if(!untilize_out && interm_dtype == output_dtype) {
cb24_size = 0;
}
if(is_depthwise_conv) {
cb24_size = output_tile_size;
}
if(cb24_size != 0) {
tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size);
}
//CB 25
uint32_t cb25_size = tilized_act_cb_size; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size);


uint32_t cb26_size = 0;
//CB26
if(untilize_out) {
cb26_size = weight_block_w_ntiles * output_tile_size; tt::log_debug(tt::LogOp, "CB26 Size: {}", cb26_size);
}

uint32_t cb27_size = 0;
if(is_depthwise_conv) {
cb27_size = output_tile_size;
}

return {output_size, cb0_size + cb1_size + cb2_size + cb5_size + cb7_size + cb24_size + cb25_size + cb26_size + cb27_size};
} else if(shard_layout == TensorMemoryLayout::BLOCK_SHARDED) {
uint32_t output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;

uint32_t bias_block_num_bytes = per_core_out_matrix_width_ntiles * bias_tile_size;

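// Number of blocks the weight matrix width is split into across cores; used below to fold
// the outer channel-block loop into in0_num_blocks_w.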
uint32_t conv_act_c_blocks = weight_matrix_width_ntiles / per_core_out_matrix_width_ntiles;

uint32_t weight_block_w_ntiles = per_core_out_matrix_width_ntiles;
uint32_t weight_block_h_ntiles = act_block_w_ntiles;


uint32_t tilized_act_block_cb_size = act_block_h_ntiles * act_block_w_ntiles * output_tile_size;
uint32_t row_major_act_cb_size = act_block_h_ntiles * act_block_w_ntiles * input_tile_size;

uint32_t output_block_ntiles = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles;

uint32_t num_blocks_act_w = weight_matrix_height_ntiles / act_block_w_ntiles;
uint32_t num_blocks_act_h = per_core_out_matrix_height_ntiles / act_block_h_ntiles;
uint32_t in0_num_blocks_w =
num_blocks_act_w * conv_act_c_blocks; // Fold outer c_block loop together with weight_block_num_tiles = 9

packer_l1_acc = packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));

auto interm_dtype =
packer_l1_acc ? (fp32_dest_acc_en ? DataType::FLOAT32 : DataType::BFLOAT16) : output_dtype;

uint32_t partial_tile_size = tt::tile_size(datatype_to_dataformat_converter(interm_dtype));




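// CB 0 (tilized activations) and CB 1 (weights) are doubled when double buffering is enabled.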
//CB 0
uint32_t cb0_size = tilized_act_block_cb_size;
if(conv_config.enable_act_double_buffer) {
cb0_size *= 2;
}
tt::log_debug(tt::LogOp, "CB0 Size: {}", cb0_size);

//CB 1
uint32_t cb1_size = weight_block_h_ntiles * weight_block_w_ntiles * weights_tile_size;
if(conv_config.enable_weights_double_buffer) {cb1_size *= 2;}

tt::log_debug(tt::LogOp, "CB1 Size: {}", cb1_size);


//CB 2
uint32_t cb2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", cb2_size);

//CB 5
uint32_t cb5_size = 64; tt::log_debug(tt::LogOp, "CB5 Size: {}", cb5_size);

//CB 6
uint32_t cb6_size = row_major_act_cb_size; tt::log_debug(tt::LogOp, "CB6 Size: {}", cb6_size);

//CB 24
uint32_t cb24_size = output_block_ntiles * partial_tile_size;
if(!untilize_out && interm_dtype == output_dtype) {
cb24_size = 0;
} else {
tt::log_debug(tt::LogOp, "CB24 Size: {}", cb24_size);
}

//CB 25
uint32_t cb25_size = tilized_act_block_cb_size; tt::log_debug(tt::LogOp, "CB25 Size: {}", cb25_size);


uint32_t cb26_size = 0;
//CB26
if(untilize_out) {
cb26_size = weight_block_w_ntiles * output_tile_size; tt::log_debug(tt::LogOp, "CB26 Size: {}", cb26_size);
}

return{ output_size, cb0_size + cb1_size + cb2_size + cb5_size + cb6_size + cb24_size + cb25_size + cb26_size};
}
return {0, 0};

}


template std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config<Device>(
Device* device,
const ttnn::Tensor& input_tensor_,