#4625: Enable multicore support for untilize with unpadding on interleaved tensors

This commit adds the ability to run the untilize_with_unpadding op on interleaved tensors across multiple cores. As part of the change, the op's leading output_tensor_start argument, which was always the zero origin, is removed from the signature and from every call site.
yan-zaretskiy committed May 17, 2024
1 parent 87a78d6 commit 61ae60a
Showing 20 changed files with 806 additions and 529 deletions.
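
As a rough sketch of the new call shape, assuming a TILE-layout interleaved input tensor x_tiled and an interleaved memory config out_mem_config (illustrative names, not taken from the diff):

    import tt_lib as ttl

    # The op now takes only the inclusive end coordinates of the output;
    # the start is implicitly (0, 0, 0, 0). After this commit the op runs
    # across multiple cores for interleaved tensors.
    unpadded_shape = x_tiled.shape_without_padding()
    y = ttl.tensor.untilize_with_unpadding(
        x_tiled,
        (unpadded_shape[0] - 1, unpadded_shape[1] - 1, unpadded_shape[2] - 1, unpadded_shape[3] - 1),
        output_mem_config=out_mem_config,
    )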
4 changes: 1 addition & 3 deletions models/demos/resnet/tt/metalResnetBlock50.py
@@ -2217,7 +2217,6 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor:
         unpadded_shape = x.shape_without_padding()
         x = tt_lib.tensor.untilize_with_unpadding(
             x,
-            (0, 0, 0, 0),
             (unpadded_shape[0] - 1, unpadded_shape[1] - 1, unpadded_shape[2] - 1, unpadded_shape[3] - 1),
             self.memory_config,
         )
@@ -2274,7 +2273,7 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor:
             ]
             if self.sharded:
                 x = tt_lib.tensor.untilize_with_unpadding(
-                    x, (0, 0, 0, 0), unpadded_shape_end, output_mem_config=self.width_sharded_memory_config
+                    x, unpadded_shape_end, output_mem_config=self.width_sharded_memory_config
                 )
             else:
                 x = tt_lib.tensor.untilize(x, self.memory_config, use_multicore=True)
@@ -2313,7 +2312,6 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor:
         desired_shape[-1] = 1000
         x = tt_lib.tensor.untilize_with_unpadding(
             x,
-            [0, 0, 0, 0],
             (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1),
             self.memory_config,
         )
4 changes: 1 addition & 3 deletions models/experimental/resnet/tt/ttnn_functional_resnet50.py
@@ -676,7 +676,6 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
         unpadded_shape = x.shape_without_padding()
         x = ttnn.experimental.tensor.untilize_with_unpadding(
             x,
-            (0, 0, 0, 0),
             (unpadded_shape[0] - 1, unpadded_shape[1] - 1, unpadded_shape[2] - 1, unpadded_shape[3] - 1),
             ttnn.L1_MEMORY_CONFIG,
         )
@@ -735,7 +734,7 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
             x.get_legacy_shape()[3] - 1,
         ]
         x = ttnn.experimental.tensor.untilize_with_unpadding(
-            x, (0, 0, 0, 0), unpadded_shape_end, output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG
+            x, unpadded_shape_end, output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG
         )

         x = ttnn.reshape(
@@ -763,7 +762,6 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
         desired_shape[-1] = 1000
         x = ttnn.experimental.tensor.untilize_with_unpadding(
             x,
-            [0, 0, 0, 0],
             (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1),
             ttnn.L1_MEMORY_CONFIG,
         )
@@ -25,7 +25,6 @@ def run_untilize_with_unpadding_tests(
     in_mem_config,
     out_mem_config,
     data_seed,
-    output_tensor_start,
     output_tensor_end,
     device,
 ):
@@ -38,12 +37,11 @@
     # compute ref value
     x_ref = x.detach().clone()
     ref_value = pytorch_ops.untilize_with_unpadding(
-        x_ref, output_tensor_start=output_tensor_start, output_tensor_end=output_tensor_end
+        x_ref, output_tensor_end=output_tensor_end
     )

     tt_result = tt_untilize_with_unpadding(
         x=x,
-        output_tensor_start=output_tensor_start,
         output_tensor_end=output_tensor_end,
         device=device,
         dtype=[dtype],
@@ -68,14 +66,13 @@
         "SYSTEM_MEMORY",
         ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM),
         5263366,
-        [0, 0, 0, 0],
         [10, 9, 4, 1],
     ),
 ]


 @pytest.mark.parametrize(
-    "input_shape, dtype, dlayout, in_mem_config, out_mem_config, data_seed, output_tensor_start, output_tensor_end",
+    "input_shape, dtype, dlayout, in_mem_config, out_mem_config, data_seed, output_tensor_end",
     (test_sweep_args),
 )
 def test_untilize_with_unpadding_test(
@@ -85,7 +82,6 @@ def test_untilize_with_unpadding_test(
     in_mem_config,
     out_mem_config,
     data_seed,
-    output_tensor_start,
     output_tensor_end,
     device,
 ):
@@ -97,7 +93,6 @@ def test_untilize_with_unpadding_test(
         in_mem_config,
         out_mem_config,
         data_seed,
-        output_tensor_start,
         output_tensor_end,
         device,
     )
@@ -823,14 +823,13 @@ def gen_untilize_with_unpadding_args(
         input_shapes, dtypes, layouts, mem_configs, do_sanitize_args=do_sanitize_args
     ):
         if input_info is not None:
-            output_tensor_start = [0, 0, 0, 0]
-            output_tensor_end = [random.randrange(output_tensor_start[i], input_shapes[0][i], 1) for i in range(4)]
+            output_tensor_end = [random.randrange(0, input_shapes[0][i], 1) for i in range(4)]
             if output_tensor_end[-1] % 2 == 0:
                 output_tensor_end[-1] += 1
             input_info.update(
                 {
-                    "output_tensor_start": output_tensor_start,
                     "output_tensor_end": output_tensor_end,
+                    "use_multicore": True,
                 }
             )
             yield input_info
@@ -38,7 +38,6 @@ def create_grid(x, y):
         "output_mem_config": ttl.tensor.MemoryConfig(
             ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM
         ),
-        "output_tensor_start": [0, 0, 0, 0],
         "output_tensor_end": [0, 0, 119, 7299],
     },
 )
10 changes: 5 additions & 5 deletions tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
@@ -1200,13 +1200,13 @@ def tilize_with_val_padding(x, output_tensor_shape, pad_value, *args, **kwargs):
     return tilized


-def untilize_with_unpadding(x, output_tensor_start, output_tensor_end, *args, **kwargs):
+def untilize_with_unpadding(x, output_tensor_end, *args, **kwargs):
     untilized = untilize_util(x)
     unpad = untilized[
-        output_tensor_start[0] : output_tensor_end[0] + 1,
-        output_tensor_start[1] : output_tensor_end[1] + 1,
-        output_tensor_start[2] : output_tensor_end[2] + 1,
-        output_tensor_start[3] : output_tensor_end[3] + 1,
+        : output_tensor_end[0] + 1,
+        : output_tensor_end[1] + 1,
+        : output_tensor_end[2] + 1,
+        : output_tensor_end[3] + 1,
     ]
     return unpad
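For intuition, a small self-contained check of the reference slicing above, using plain PyTorch and a made-up shape and end point: with the start pinned at the origin, unpadding reduces to an end-inclusive slice per dimension.

    import torch

    # Hypothetical shape and end indices, chosen only for illustration.
    x = torch.arange(2 * 3 * 4 * 8, dtype=torch.float32).reshape(2, 3, 4, 8)
    output_tensor_end = [1, 2, 3, 5]
    unpad = x[
        : output_tensor_end[0] + 1,
        : output_tensor_end[1] + 1,
        : output_tensor_end[2] + 1,
        : output_tensor_end[3] + 1,
    ]
    assert unpad.shape == (2, 3, 4, 6)  # each dim keeps end index + 1 elements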
3 changes: 1 addition & 2 deletions tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py
@@ -2063,7 +2063,6 @@ def untilize_with_unpadding(
     layout,
     input_mem_config,
     output_mem_config,
-    output_tensor_start,
     output_tensor_end,
     **kwargs,
 ):
@@ -2085,7 +2084,7 @@ def untilize_with_unpadding(
     )

     t1 = ttl.tensor.untilize_with_unpadding(
-        t0, output_tensor_start, output_tensor_end, output_mem_config=output_mem_config
+        t0, output_tensor_end, output_mem_config=output_mem_config
     )

     return tt2torch_tensor(t1)
@@ -1655,7 +1655,6 @@ def test_block_sharded_untilize_with_unpadding(in_sharded, out_sharded, dtype, d

     yt = ttl.tensor.untilize_with_unpadding(
         xt,
-        ttl.tensor.Shape([0, 0, 0, 0]),
         ttl.tensor.Shape([0, 0, 391, 511]),
         output_mem_config=out_mem_config,
     )
@@ -1744,7 +1743,6 @@ def test_width_sharded_untilize_with_unpadding(

     yt = ttl.tensor.untilize_with_unpadding(
         xt,
-        ttl.tensor.Shape([0, 0, 0, 0]),
         ttl.tensor.Shape([N - 1, C - 1, output_H - 1, W - 1]),
         output_mem_config=out_mem_config,
     )
2 changes: 0 additions & 2 deletions tt_eager/tt_dnn/op_library/auto_format.cpp
@@ -153,7 +153,6 @@ Tensor AutoFormat::format_output_tensor(
     } else if (formatted_output.get_layout() == Layout::TILE && AutoFormat::legal_rm_shape(shape)) {
         formatted_output = untilize_with_unpadding(
             formatted_output,
-            {0, 0, 0, 0},
             {shape[0] - 1, shape[1] - 1, shape[2] - 1, shape[3] - 1},
             mem_config);
         return formatted_output;
@@ -163,7 +162,6 @@
         AutoFormat::legal_rm_shape(shape)) {
         formatted_output = untilize_with_unpadding(
             formatted_output,
-            {0, 0, 0, 0},
             {shape[0] - 1, shape[1] - 1, shape[2] - 1, shape[3] - 1},
             mem_config);
         return formatted_output;
@@ -53,9 +53,8 @@ void kernel_main() {

     cb_reserve_back(cb_id_in0, num_tiles_per_row * has_rows);
     uint32_t l1_write_addr = get_write_ptr(cb_id_in0);
-    uint32_t curr_stick_id = base_stick_id;
     for (uint32_t k = 0; k < num_rows; k++) {
-        uint64_t src_noc_addr = get_noc_addr(curr_stick_id + k, s);
+        uint64_t src_noc_addr = get_noc_addr(base_stick_id + k, s);

         // Read from DRAM to tmp buffer
         noc_async_read(src_noc_addr, l1_write_addr, unpadded_X_size);
113 changes: 0 additions & 113 deletions tt_eager/tt_dnn/op_library/tilize/tilize_multi_core/padding.h

This file was deleted.
