#0: Func for packer_l1_acc_en calc
sankarmanoj-tt committed Dec 24, 2024
1 parent 9592cf1 commit 11b176c
Showing 4 changed files with 16 additions and 3 deletions.
5 changes: 4 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -1208,7 +1208,6 @@ conv_op_l1_usage conv2d::estimate_L1_usage(
     }
     tt::log_debug(tt::LogOp, "CB1 Size: {}", weights_cb_1_size);
 
-
     //CB 2
     uint32_t bias_cb_2_size = bias_block_num_bytes;
     tt::log_debug(tt::LogOp, "CB2 Size: {}", bias_cb_2_size);
@@ -1346,6 +1345,10 @@ conv_op_l1_usage conv2d::estimate_L1_usage(
 
 }
 
+bool conv2d::determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w){
+    return packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
+}
+
 
 template std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool, bool> get_conv_padded_input_shape_and_mem_config<Device>(
     Device* device,
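
For reference, a minimal standalone sketch of the predicate added above. The free function below mirrors conv2d::determine_packer_l1_acc outside the ttnn codebase so its truth table can be checked in isolation; it is an illustration, not the library's API.

#include <cassert>
#include <cstdint>

// Mirrors the commit's logic: packer L1 accumulation is only worthwhile when
// enough in0 blocks are accumulated to amortize the reconfig overhead.
bool determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w) {
    return packer_l1_acc && ((enable_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
}

int main() {
    assert(!determine_packer_l1_acc(false, true, 8));  // feature disabled globally
    assert(!determine_packer_l1_acc(true, false, 2));  // no bias: 2 blocks still spill and reload
    assert( determine_packer_l1_acc(true, false, 3));  // no bias: 3+ blocks benefit
    assert( determine_packer_l1_acc(true, true, 2));   // bias: last block stays in L1, so 2 suffice
    assert(!determine_packer_l1_acc(true, true, 1));   // single block: nothing to accumulate
    return 0;
}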
10 changes: 10 additions & 0 deletions ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -235,11 +235,21 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional<const
     bool use_non_tile_height = false
 );
 
+
+// Only enable packer L1 accumulation when in0_num_blocks_w > 2; otherwise unnecessary
+// reconfig overhead is added. The last iteration of L1 accumulation does a spill and
+// reload, so more than 2 blocks are needed to use L1 acc for the packer. With bias, the
+// last iteration stays in the intermediate buffer without a spill and reload, so 2 blocks suffice.
+bool determine_packer_l1_acc(bool packer_l1_acc, bool enable_bias, uint32_t in0_num_blocks_w);
+
 struct conv_op_l1_usage{
     uint32_t tensor_allocation_size;
     uint32_t CB_allocation_size;
 };
 
+// This function calculates how much L1 memory the conv2d op will allocate.
+// L1 is allocated either for the output tensor or for circular buffers. This excludes
+// circular buffers that use globally allocated addresses, as these need no memory allocation.
 conv_op_l1_usage estimate_L1_usage(
     tt::ARCH arch, TensorMemoryLayout shard_layout,
     const DataType input_dtype, const DataType weights_dtype, const DataType output_dtype,
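
To illustrate how a caller might consume conv_op_l1_usage, here is a hedged sketch: fits_in_l1 and the budget value are hypothetical, introduced only for this example, and are not part of the ttnn codebase.

#include <cstdint>

// Illustrative mirror of the conv_op_l1_usage struct declared above.
struct conv_op_l1_usage {
    uint32_t tensor_allocation_size;  // bytes for the output tensor allocation
    uint32_t CB_allocation_size;      // bytes for locally allocated circular buffers
};

// Hypothetical helper: compare an estimate against a per-core L1 budget.
// Globally allocated CBs are already excluded from CB_allocation_size, per the comment above.
bool fits_in_l1(const conv_op_l1_usage& usage, uint32_t l1_budget_bytes) {
    return usage.tensor_allocation_size + usage.CB_allocation_size <= l1_budget_bytes;
}

int main() {
    conv_op_l1_usage est{600 * 1024, 400 * 1024};  // made-up numbers
    return fits_in_l1(est, 1464 * 1024) ? 0 : 1;   // budget is an assumption, not a device spec
}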
@@ -1101,7 +1101,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 // unnecessary reconfig overhead is added. The last iteration of L1 accumulation
 // does a spill and reload, so more than 2 blocks are needed to use L1 acc for the packer.
 // With bias, the last iteration stays in the intermediate buffer and does not spill and reload.
-bool packer_l1_acc_en = packer_l1_acc && ((has_bias && in0_num_blocks_w > 1) || (in0_num_blocks_w > 2));
+bool packer_l1_acc_en = determine_packer_l1_acc(packer_l1_acc, has_bias, in0_num_blocks_w);
 
 std::tuple<CBHandle, CBHandle> input_output_cbs = {0, 0};
 if (is_conv1d and is_depthwise_conv) {
@@ -305,7 +305,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
     "Number of Act Blocks along the Width {} should be divisible by the number of cores {}",
     num_blocks_act_w,
     input_num_cores);
-bool packer_l1_acc_en = packer_l1_acc && ((has_bias && num_blocks_act_w > 1) || (num_blocks_act_w > 2));
+bool packer_l1_acc_en = determine_packer_l1_acc(packer_l1_acc, has_bias, num_blocks_act_w);
 tt::DataFormat interm0_df =
     packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df;
 log_debug(LogOp, "interm0_df: {}", interm0_df);
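
The interm0_df selection above is where the new predicate pays off: when packer L1 accumulation is enabled, partial sums live in the intermediate CB at accumulation precision rather than the output format. A standalone sketch of that selection logic, with a pared-down stand-in for tt::DataFormat (illustrative only):

// Illustrative stand-in for tt::DataFormat; only the values used here are modeled.
enum class DataFormat { Float32, Float16_b, Bfp8_b };

// Mirrors the interm0_df ternary above: with packer L1 acc enabled, the intermediate
// CB holds partials at accumulation precision; otherwise it simply reuses out_df.
DataFormat select_interm0_df(bool packer_l1_acc_en, bool fp32_dest_acc_en, DataFormat out_df) {
    return packer_l1_acc_en ? (fp32_dest_acc_en ? DataFormat::Float32 : DataFormat::Float16_b)
                            : out_df;
}

int main() {
    // L1 acc disabled: the intermediate CB reuses the output format unchanged.
    DataFormat df = select_interm0_df(false, true, DataFormat::Bfp8_b);
    return df == DataFormat::Bfp8_b ? 0 : 1;
}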
