13541 set act_block_w_div to max value
Choose the max valid value for act_block_w_div in the
auto-sharding code path for conv2d in order to
minimize L1 usage for WIDTH_SHARDED convs.

This improves the pass rate on ttnn torch traces.
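
To make the choice concrete, here is a minimal Python sketch of the divisor the auto-shard path now picks (div_up and TILE_WIDTH stand in for tt::div_up and constants::TILE_WIDTH; the 11-core grid is an assumed example, not taken from this commit):

    TILE_WIDTH = 32

    def div_up(a: int, b: int) -> int:
        # Integer ceiling division, mirroring tt::div_up.
        return -(-a // b)

    def max_act_block_w_div(in_channels: int, num_cores: int) -> int:
        # Largest valid divisor: leaves one 32-wide tile per core.
        return div_up(in_channels, num_cores * TILE_WIDTH)

    # Example: 2904 input channels (one of the conv shapes in the test
    # diff below), width-sharded over an assumed 11 cores.
    print(max_act_block_w_div(2904, 11))  # 9, instead of the old default of 1

A larger act_block_w_div shrinks act_block_w, and with it the per-core activation buffer in L1, by roughly the same factor.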
Pavle Josipovic authored and pavlejosipovic committed Dec 4, 2024
1 parent 77e056e commit 68bc110
Showing 4 changed files with 24 additions and 6 deletions.
@@ -456,7 +456,6 @@ def test_conv2d_localrun(device, input_spec):
 # Input is 32MB, maps to MM 64 cores; we need to avoid sharding this tensor and use dram interleaved directly with MM
 [1, 256, 1024, 128, 128, 1, 1, 1, 1, 0, 0, 1, False, 1], # 5
 [1, 1056, 1056, 96, 96, 3, 3, 2, 2, 1, 1, 4, False, 1], # 14
-[1, 2904, 2904, 24, 24, 3, 3, 1, 1, 1, 1, 11, False, 1], # 169
 [1, 2904, 2904, 48, 48, 3, 3, 2, 2, 1, 1, 11, False, 1], # 170
 [1, 1024, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 172
 [1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, False, 1], # 181
@@ -465,7 +464,6 @@ def test_conv2d_localrun(device, input_spec):
 [1, 64, 3, 720, 1280, 7, 7, 2, 2, 3, 3, 1, False, 1], # 203
 [1, 64, 3, 800, 1088, 7, 7, 2, 2, 3, 3, 1, False, 1], # 204
 [1, 528, 528, 192, 192, 3, 3, 2, 2, 1, 1, 2, False, 1], # 292
-[1, 7392, 7392, 24, 24, 3, 3, 2, 2, 1, 1, 28, False, 1], # 366
 [1, 816, 816, 19, 19, 5, 5, 1, 1, 2, 2, 816, False, 1], # 373
 [1, 816, 816, 23, 23, 5, 5, 2, 2, 0, 0, 816, False, 1], # 374
 [1, 960, 960, 24, 24, 5, 5, 1, 1, 2, 2, 960, False, 1], # 394
4 changes: 2 additions & 2 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -153,7 +153,7 @@ def run_conv(
     if config_override and "act_block_h" in config_override and not auto_shard:
         conv_config.act_block_h_override = config_override["act_block_h"]
 
-    if config_override and "act_block_w_div" in config_override:
+    if config_override and "act_block_w_div" in config_override and not auto_shard:
         conv_config.act_block_w_div = config_override["act_block_w_div"]
 
     if config_override and "num_cores_nhw" in config_override:
@@ -635,7 +635,7 @@ def test_conv_ws(
         enable_split_reader=False,
         enable_subblock_padding=False,
         reshard_if_not_optimal=True,
-        act_block_w_div=act_block_w_div,
+        act_block_w_div=act_block_w_div if not auto_shard else 1,
         act_block_h_override=32,
     )
     [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d(
20 changes: 20 additions & 0 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -333,6 +333,7 @@ OptimizedConvBlockConfig determine_per_core_conv_block_config(
         ? round_up(padded_in_channels * window_w, 32)
         : padded_in_channels;
     if (parallel_config.shard_scheme == TensorMemoryLayout::WIDTH_SHARDED) {
+        TT_ASSERT(padded_in_channels % (32 * parallel_config.grid.num_cores() * act_block_w_div) == 0);
         act_block_w = (padded_in_channels * window_h * window_w) / (parallel_config.grid.num_cores() * act_block_w_div);
     }
     TT_ASSERT(act_block_w % 32 == 0);
@@ -877,6 +878,25 @@ static void adjust_conv_op_config_for_auto_shard(
         // be conservative with L1 memory usage.
         conv_config.act_block_h_override = constants::TILE_HEIGHT;
     }
+
+    if (conv_config.act_block_w_div == 1 && conv_config.shard_layout == TensorMemoryLayout::WIDTH_SHARDED) {
+        uint32_t width_sharded_num_cores = determine_parallel_config(
+                                               TensorMemoryLayout::WIDTH_SHARDED,
+                                               batch_size,
+                                               in_channels,
+                                               output_height,
+                                               output_width,
+                                               out_channels,
+                                               compute_grid_size,
+                                               shard_orientation,
+                                               !is_mm_conv)
+                                               .grid.num_cores();
+        // Set act_block_w_div to the max value to be conservative with L1 memory usage.
+        // act_block_w_div == 1 is currently the default value.
+        conv_config.act_block_w_div =
+            tt::div_up(in_channels, width_sharded_num_cores * constants::TILE_WIDTH);
+    }
 }
 
 template <typename T>
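Worked through with concrete numbers, the new assert and the act_block_w computation above fit together as follows (a sketch under assumed values: 2904 input channels padded to 3168 for an 11-core grid with a 3x3 window; the exact padding scheme is an assumption):

    TILE_WIDTH = 32
    in_channels, num_cores, window_h, window_w = 2904, 11, 3, 3
    act_block_w_div = 9  # div_up(2904, 11 * 32), as chosen by the auto-shard path
    padded_in_channels = 3168  # 2904 rounded up to a multiple of 11 * 32 (assumed)

    # The new TT_ASSERT: channels must split evenly into 32-wide tiles
    # across all cores and across all act_block_w_div sub-blocks.
    assert padded_in_channels % (TILE_WIDTH * num_cores * act_block_w_div) == 0

    act_block_w = (padded_in_channels * window_h * window_w) // (num_cores * act_block_w_div)
    assert act_block_w % TILE_WIDTH == 0  # mirrors TT_ASSERT(act_block_w % 32 == 0)
    print(act_block_w)  # 288, versus 2592 under the old default act_block_w_div == 1
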
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp
@@ -41,8 +41,8 @@ struct Conv2dConfig {
     bool deallocate_activation = false;
     bool reallocate_halo_output = false;
     uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED.
-    uint32_t act_block_w_div = 1; // Amount by which the maximum possible act_block_width is divided. Max act_block_w = (in_channels * window_w * window_h) / total_num_cores;
-                                  // Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED.
+    uint32_t act_block_w_div = 1; // Amount by which the maximum possible act_block_width is divided. Max act_block_w = in_channels / (total_num_cores * TILE_WIDTH);
+                                  // Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED.
     bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true
     bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true
     std::optional<TensorMemoryLayout> shard_layout;
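
The practical upshot for callers: under auto-sharding, a width-sharded conv no longer needs a hand-tuned act_block_w_div, since leaving the field at its default of 1 lets the new code path maximize it. A usage sketch based on the tests above (only fields that appear in this diff are set; any other constructor details are assumptions):

    import ttnn

    # shard_layout is left unset so auto-sharding picks the layout; if it
    # picks WIDTH_SHARDED while act_block_w_div is still 1 (the default),
    # the config is adjusted to div_up(in_channels, num_cores * TILE_WIDTH).
    conv_config = ttnn.Conv2dConfig(
        act_block_w_div=1,  # default; the auto-shard path may raise it
        reshard_if_not_optimal=True,
    )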