From 16ff6f03e21c6db5bca42f0883b1921cdd154d54 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Thu, 19 Dec 2024 11:44:52 +0000 Subject: [PATCH] Conv2dConfig reallocate_halo_output default to true The idea behind this flag is to enable reallocation of the halo output buffer in place of the input buffer in case the deallocate_activation flag is also set. In this case the halo output is moved up and memory fragmentation is reduced. In case deallocate_activation is not set, reallocate_halo_output won't have an effect, as ttnn::move will be a no-op since the input buffer is not deallocated. The current problem is that if a user sets deallocate_activation it doesn't help them with the memory issue unless they set reallocate_halo_output as well. This change addresses this by setting reallocate_halo_output to true by default. This helps by increasing the pass rate in ttnn torch traces. --- .../sweeps/conv2d/short/conv2d_short_sweep.py | 5 ----- .../ttnn/operations/conv/conv2d/conv2d_utils.hpp | 16 ++++++++-------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py index 56d9790840f..55fb094e4ec 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py @@ -1620,20 +1620,15 @@ def test_conv2d_localrun(device, input_spec): [1, 960, 960, 24, 24, 5, 5, 1, 1, 2, 2, 960, False, 1], # 5 [1, 816, 816, 19, 19, 5, 5, 1, 1, 2, 2, 816, False, 1], # 19 [1, 816, 816, 23, 23, 5, 5, 2, 2, 0, 0, 816, False, 1], # 20 - [1, 1056, 1056, 96, 96, 3, 3, 2, 2, 1, 1, 4, False, 1], # 127 [1, 528, 528, 192, 192, 3, 3, 2, 2, 1, 1, 2, False, 1], # 220 - [1, 2904, 2904, 48, 48, 3, 3, 2, 2, 1, 1, 11, False, 1], # 294 [1, 819, 256, 100, 136, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1443 [1, 819, 256, 50, 68, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1447 [1, 1024, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1458 [1, 
768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, False, 1], # 1460 [1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1461 [1, 768, 3, 384, 512, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1464 - [1, 64, 3, 720, 1280, 7, 7, 2, 2, 3, 3, 1, False, 1], # 1471 - [1, 64, 3, 800, 1088, 7, 7, 2, 2, 3, 3, 1, False, 1], # 1472 [1, 1, 64, 480, 640, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1495 [1, 64, 64, 480, 640, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1496 - [1, 640, 1920, 32, 32, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1522 [1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1530 [1, 320, 640, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1540 [1, 320, 960, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1545 diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp index 389e3bf736d..b6b894ed0df 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp @@ -24,14 +24,14 @@ struct Conv2dConfig { DataType weights_dtype = DataType::BFLOAT16; string activation = ""; uint32_t input_channels_alignment = 32; - bool deallocate_activation = false; - bool reallocate_halo_output = false; - uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED. - uint32_t act_block_w_div = - 1; // Amount by which the maximum possible act_block_width is divided. Max act_block_w = in_channels / - // (total_num_cores * TILE_WIDTH); Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED - bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true - bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true + bool deallocate_activation = false; // If true, the input tensor will be deallocated if it's on device. + bool reallocate_halo_output = true; // If true, after the halo device op is done, the output tensor will be reallocated 
+ uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED. + uint32_t act_block_w_div = 1; // Amount by which the maximum possible act_block_width is divided. Max act_block_w = in_channels / (total_num_cores * TILE_WIDTH); + // Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED + bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true + bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true std::optional shard_layout = std::nullopt; std::optional core_grid = std::nullopt; // used only if override_sharding_config is true bool transpose_shards = true; // used only if override_sharding_config is true and if height sharding is false