#0: Func for packer_l1_acc_en calc
sankarmanoj-tt committed Dec 24, 2024
1 parent 9592cf1 commit 11b176c
Showing 4 changed files with 16 additions and 3 deletions.
5 changes: 4 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -1208,7 +1208,6 @@ conv_op_l1_usage conv2d::estimate_L1_usage(
     }
     tt::log_debug(tt::LogOp, "CB1 Size: {}", weights_cb_1_size);
 
-
     //CB 2
     uint32_t bias_cb_2_size = bias_block_num_bytes;
     tt::log_debug(tt::LogOp, "CB2 Size: {}", bias_cb_2_size);
@@ -1346,6 +1345,10 @@ conv_op_l1_usage conv2d::estimate_L1_usage(
 
 }
 
+bool conv2d::determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w){
+    return packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
+}
+
 
 template std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config<Device>(
     Device* device,
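
For reference, a minimal standalone sketch of the predicate added above. The free function below mirrors conv2d::determine_packer_l1_acc outside the ttnn codebase so its truth table can be checked in isolation; it is an illustration, not the library's API.

#include <cassert>
#include <cstdint>

// Mirrors the commit's logic: packer L1 accumulation is only worthwhile when
// enough in0 blocks are accumulated to amortize the reconfig overhead.
bool determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w) {
    return packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
}

int main() {
    assert(!determine_packer_l1_acc(false, true, 8));  // feature disabled globally
    assert(!determine_packer_l1_acc(true, false, 2));  // no bias: 2 blocks still spill and reload
    assert( determine_packer_l1_acc(true, false, 3));  // no bias: 3+ blocks benefit
    assert( determine_packer_l1_acc(true, true, 2));   // bias: last block stays in L1, so 2 suffice
    assert(!determine_packer_l1_acc(true, true, 1));   // single block: nothing to accumulate
    return 0;
}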
10 changes: 10 additions & 0 deletions ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -235,11 +235,21 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional<const
     bool use_non_tile_height = false
 );
 
+
+// Only enable packer L1 accumulation when in0_num_blocks_w > 2; otherwise unnecessary
+// reconfig overhead is added. The last iteration of L1 accumulation does a spill and
+// reload, so more than 2 blocks are needed to use L1 acc for the packer. With bias, the
+// last iteration stays in the intermediate buffer without a spill and reload, so 2 blocks suffice.
+bool determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w);
+
 struct conv_op_l1_usage{
     uint32_t tensor_allocation_size;
     uint32_t CB_allocation_size;
 };
 
+// This function calculates how much L1 memory the conv2d op will allocate.
+// L1 is allocated either for the output tensor or for circular buffers. This excludes
+// circular buffers that use globally allocated addresses, as these need no memory allocation.
 conv_op_l1_usage estimate_L1_usage(
     tt::ARCH arch, TensorMemoryLayout shard_layout,
     const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype,
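
To illustrate how a caller might consume conv_op_l1_usage, here is a hedged sketch: fits_in_l1 and the budget value are hypothetical, introduced only for this example, and are not part of the ttnn codebase.

#include <cstdint>

// Illustrative mirror of the conv_op_l1_usage struct declared above.
struct conv_op_l1_usage {
    uint32_t tensor_allocation_size;  // bytes for the output tensor allocation
    uint32_t CB_allocation_size;      // bytes for locally allocated circular buffers
};

// Hypothetical helper: compare an estimate against a per-core L1 budget.
// Globally allocated CBs are already excluded from CB_allocation_size, per the comment above.
bool fits_in_l1(const conv_op_l1_usage& usage, uint32_t l1_budget_bytes) {
    return usage.tensor_allocation_size + usage.CB_allocation_size <= l1_budget_bytes;
}

int main() {
    conv_op_l1_usage est{600 * 1024, 400 * 1024};  // made-up numbers
    return fits_in_l1(est, 1464 * 1024) ? 0 : 1;   // budget is an assumption, not a device spec
}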
@@ -1101,7 +1101,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 // unnecessary reconfig overhead is added. The last iteration of L1 accumulation
 // does a spill and reload, so more than 2 blocks are needed to use L1 acc for the packer.
 // With bias, the last iteration stays in the intermediate buffer and does not spill and reload.
-bool packer_l1_acc_en = packer_l1_acc && ((has_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
+bool packer_l1_acc_en = determine_packer_l1_acc(packer_l1_acc, has_bias, in0_num_blocks_w);
 
 std::tuple<CBHandle, CBHandle> input_output_cbs = {0, 0};
 if (is_conv1d and is_depthwise_conv) {
@@ -305,7 +305,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
     "Number of Act Blocks along the Width {} should be divisible by the number of cores {}",
     num_blocks_act_w,
     input_num_cores);
-bool packer_l1_acc_en = packer_l1_acc && ((has_bias && num_blocks_act_w > 1) || (num_blocks_act_w > 2));
+bool packer_l1_acc_en = determine_packer_l1_acc(packer_l1_acc, has_bias, num_blocks_act_w);
 tt::DataFormat interm0_df =
     packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df;
 log_debug(LogOp, "interm0_df: {}", interm0_df);
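
The interm0_df selection above is where the new predicate pays off: when packer L1 accumulation is enabled, partial sums live in the intermediate CB at accumulation precision rather than the output format. A standalone sketch of that selection logic, with a pared-down stand-in for tt::DataFormat (illustrative only):

// Illustrative stand-in for tt::DataFormat; only the values used here are modeled.
enum class DataFormat { Float32, Float16_b, Bfp8_b };

// Mirrors the interm0_df ternary above: with packer L1 acc enabled, the intermediate
// CB holds partials at accumulation precision; otherwise it simply reuses out_df.
DataFormat select_interm0_df(bool packer_l1_acc_en, bool fp32_dest_acc_en, DataFormat out_df) {
    return packer_l1_acc_en ? (fp32_dest_acc_en ? DataFormat::Float32 : DataFormat::Float16_b)
                            : out_df;
}

int main() {
    // L1 acc disabled: the intermediate CB reuses the output format unchanged.
    DataFormat df = select_interm0_df(false, true, DataFormat::Bfp8_b);
    return df == DataFormat::Bfp8_b ? 0 : 1;
}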
