From 11b176c35d63cc967fd101a790d86f31305d928c Mon Sep 17 00:00:00 2001 From: Sankar Manoj Date: Tue, 24 Dec 2024 10:52:39 +0000 Subject: [PATCH] #0: Func for packer_l1_acc_en calc --- ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp | 5 ++++- .../ttnn/operations/conv/conv2d/device/conv2d_op.hpp | 10 ++++++++++ .../device/conv2d_op_sharded_program_factory.cpp | 2 +- .../device/conv2d_op_width_sharded_program_factory.cpp | 2 +- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 5fcc28d3f2c4..e56954135d87 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -1208,7 +1208,6 @@ conv_op_l1_usage conv2d::estimate_L1_usage( } tt::log_debug(tt::LogOp, "CB1 Size: {}", weights_cb_1_size); - //CB 2 uint32_t bias_cb_2_size = bias_block_num_bytes; tt::log_debug(tt::LogOp, "CB2 Size: {}", bias_cb_2_size); @@ -1346,6 +1345,10 @@ conv_op_l1_usage conv2d::estimate_L1_usage( } +bool conv2d::determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w){ + return packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2)); +} + template std::tuple get_conv_padded_input_shape_and_mem_config( Device* device, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index be7f05f67146..cf9a857ffad4 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -235,11 +235,21 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional 2, otherwise +// unnecessary overhead for reconfigs are added. Last iteration of l1 accumulation +// does a spill and reload, so need more than 2 blocks to use l1 acc for packer +// For bias, last iteration of l1 acc remains in intermediate buffer, does not spill and reload +bool determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w); + struct conv_op_l1_usage{ uint32_t tensor_allocation_size; uint32_t CB_allocation_size; }; +//This function calculates how much L1 will be allocated by the conv2d op. +//L1 allocation is either for the output tensor or for Circular Buffers. +//This doesn't include Circular Buffers that use globally allocated addresses, as these don't need memory allocation. conv_op_l1_usage estimate_L1_usage( tt::ARCH arch, TensorMemoryLayout shard_layout, const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index e91b1f85f6fc..b270d40391be 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -1101,7 +1101,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // unnecessary overhead for reconfigs are added. Last iteration of l1 accumulation // does a spill and reload, so need more than 2 blocks to use l1 acc for packer // For bias, last iteration of l1 acc remains in intermediate buffer, does not spill and reload - bool packer_l1_acc_en = packer_l1_acc && ((has_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2)); + bool packer_l1_acc_en = determine_packer_l1_acc(packer_l1_acc, has_bias, in0_num_blocks_w); std::tuple input_output_cbs = {0, 0}; if (is_conv1d and is_depthwise_conv) { diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 37541d11b14a..efe988ebb6f0 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -305,7 +305,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh "Number of Act Blocks along the Width {} should be divisible by the number of cores {}", num_blocks_act_w, input_num_cores); - bool packer_l1_acc_en = packer_l1_acc && ((has_bias && num_blocks_act_w > 1) || (num_blocks_act_w > 2)); + bool packer_l1_acc_en = determine_packer_l1_acc(packer_l1_acc, has_bias, num_blocks_act_w); tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; log_debug(LogOp, "interm0_df: {}", interm0_df);