From 16ff6f03e21c6db5bca42f0883b1921cdd154d54 Mon Sep 17 00:00:00 2001 From: Pavle Josipovic Date: Thu, 19 Dec 2024 11:44:52 +0000 Subject: [PATCH] Conv2dConfig reallocate_halo_output default to true The idea behind this flag is to enable reallocation of the halo output buffer in place of the input buffer in case the deallocate_activation flag is also set. In this case the halo output is moved up and memory fragmentation is reduced. In case deallocate_activation is not set, reallocate_halo_output won't have an effect, as ttnn::move will be a no-op since the input buffer is not deallocated. The current problem is that if a user sets deallocate_activation it doesn't help them with the memory issue unless they set reallocate_halo_output as well. This change addresses this by setting reallocate_halo_output to true by default. This helps by increasing the pass rate in ttnn torch traces. --- .../sweeps/conv2d/short/conv2d_short_sweep.py | 5 ----- .../ttnn/operations/conv/conv2d/conv2d_utils.hpp | 16 ++++++++-------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py index 56d9790840f..55fb094e4ec 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py @@ -1620,20 +1620,15 @@ def test_conv2d_localrun(device, input_spec): [1, 960, 960, 24, 24, 5, 5, 1, 1, 2, 2, 960, False, 1], # 5 [1, 816, 816, 19, 19, 5, 5, 1, 1, 2, 2, 816, False, 1], # 19 [1, 816, 816, 23, 23, 5, 5, 2, 2, 0, 0, 816, False, 1], # 20 - [1, 1056, 1056, 96, 96, 3, 3, 2, 2, 1, 1, 4, False, 1], # 127 [1, 528, 528, 192, 192, 3, 3, 2, 2, 1, 1, 2, False, 1], # 220 - [1, 2904, 2904, 48, 48, 3, 3, 2, 2, 1, 1, 11, False, 1], # 294 [1, 819, 256, 100, 136, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1443 [1, 819, 256, 50, 68, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1447 [1, 1024, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1458 [1, 
768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, False, 1], # 1460 [1, 768, 3, 224, 224, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1461 [1, 768, 3, 384, 512, 32, 32, 32, 32, 0, 0, 1, True, 1], # 1464 - [1, 64, 3, 720, 1280, 7, 7, 2, 2, 3, 3, 1, False, 1], # 1471 - [1, 64, 3, 800, 1088, 7, 7, 2, 2, 3, 3, 1, False, 1], # 1472 [1, 1, 64, 480, 640, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1495 [1, 64, 64, 480, 640, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1496 - [1, 640, 1920, 32, 32, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1522 [1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1530 [1, 320, 640, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1540 [1, 320, 960, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], # 1545 diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp index 389e3bf736d..b6b894ed0df 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp @@ -24,14 +24,14 @@ struct Conv2dConfig { DataType weights_dtype = DataType::BFLOAT16; string activation = ""; uint32_t input_channels_alignment = 32; - bool deallocate_activation = false; - bool reallocate_halo_output = false; - uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED. - uint32_t act_block_w_div = - 1; // Amount by which the maximum possible act_block_width is divided. Max act_block_w = in_channels / - // (total_num_cores * TILE_WIDTH); Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED - bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true - bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true + bool deallocate_activation = false; // If true, the input tensor will be deallocated if it's on device. + bool reallocate_halo_output = true; // If true, after the halo device op is done, the output tensor will be reallocated 
+ uint32_t act_block_h_override = 0; // This argument is ignored when shard_layout == WIDTH_SHARDED. + uint32_t act_block_w_div = 1; // Amount by which the maximum possible act_block_width is divided. Max act_block_w = in_channels / (total_num_cores * TILE_WIDTH); + // Ignored when shard_layout == HEIGHT_SHARDED or BLOCK_SHARDED + bool reshard_if_not_optimal = false; // if true, override_sharding_config should not be set to true + bool override_sharding_config = false; // if true, reshard_if_not_optimal should not be set to true std::optional shard_layout = std::nullopt; std::optional core_grid = std::nullopt; // used only if override_sharding_config is true bool transpose_shards = true; // used only if override_sharding_config is true and if height sharding is false