#4625: Enable multicore support for untilize with unpadding on interleaved tensors

This commit adds the ability to run the untilize_with_unpadding op on interleaved tensors across multiple cores. As part of the change, the op's leading output_tensor_start argument, which was always the zero origin, is removed from the signature and from every call site.
yan-zaretskiy committed May 17, 2024
1 parent 87a78d6 commit 61ae60a
Showing 20 changed files with 806 additions and 529 deletions.
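
As a rough sketch of the new call shape, assuming a TILE-layout interleaved input tensor x_tiled and an interleaved memory config out_mem_config (illustrative names, not taken from the diff):

    import tt_lib as ttl

    # The op now takes only the inclusive end coordinates of the output;
    # the start is implicitly (0, 0, 0, 0). After this commit the op runs
    # across multiple cores for interleaved tensors.
    unpadded_shape = x_tiled.shape_without_padding()
    y = ttl.tensor.untilize_with_unpadding(
        x_tiled,
        (unpadded_shape[0] - 1, unpadded_shape[1] - 1, unpadded_shape[2] - 1, unpadded_shape[3] - 1),
        output_mem_config=out_mem_config,
    )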
4 changes: 1 addition & 3 deletions models/demos/resnet/tt/metalResnetBlock50.py
@@ -2217,7 +2217,6 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor:
         unpadded_shape = x.shape_without_padding()
         x = tt_lib.tensor.untilize_with_unpadding(
             x,
-            (0, 0, 0, 0),
             (unpadded_shape[0] - 1, unpadded_shape[1] - 1, unpadded_shape[2] - 1, unpadded_shape[3] - 1),
             self.memory_config,
         )
@@ -2274,7 +2273,7 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor:
             ]
             if self.sharded:
                 x = tt_lib.tensor.untilize_with_unpadding(
-                    x, (0, 0, 0, 0), unpadded_shape_end, output_mem_config=self.width_sharded_memory_config
+                    x, unpadded_shape_end, output_mem_config=self.width_sharded_memory_config
                 )
             else:
                 x = tt_lib.tensor.untilize(x, self.memory_config, use_multicore=True)
@@ -2313,7 +2312,6 @@ def forward(self, x: tt_lib.tensor) -> tt_lib.tensor:
         desired_shape[-1] = 1000
         x = tt_lib.tensor.untilize_with_unpadding(
             x,
-            [0, 0, 0, 0],
             (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1),
             self.memory_config,
         )
4 changes: 1 addition & 3 deletions models/experimental/resnet/tt/ttnn_functional_resnet50.py
@@ -676,7 +676,6 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
         unpadded_shape = x.shape_without_padding()
         x = ttnn.experimental.tensor.untilize_with_unpadding(
             x,
-            (0, 0, 0, 0),
             (unpadded_shape[0] - 1, unpadded_shape[1] - 1, unpadded_shape[2] - 1, unpadded_shape[3] - 1),
             ttnn.L1_MEMORY_CONFIG,
         )
@@ -735,7 +734,7 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
             x.get_legacy_shape()[3] - 1,
         ]
         x = ttnn.experimental.tensor.untilize_with_unpadding(
-            x, (0, 0, 0, 0), unpadded_shape_end, output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG
+            x, unpadded_shape_end, output_mem_config=ttnn.L1_WIDTH_SHARDED_MEMORY_CONFIG
         )

         x = ttnn.reshape(
@@ -763,7 +762,6 @@ def __call__(self, input_tensor) -> ttnn.Tensor:
         desired_shape[-1] = 1000
         x = ttnn.experimental.tensor.untilize_with_unpadding(
             x,
-            [0, 0, 0, 0],
             (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1),
             ttnn.L1_MEMORY_CONFIG,
         )
@@ -25,7 +25,6 @@ def run_untilize_with_unpadding_tests(
     in_mem_config,
     out_mem_config,
     data_seed,
-    output_tensor_start,
     output_tensor_end,
     device,
 ):
@@ -38,12 +37,11 @@
     # compute ref value
     x_ref = x.detach().clone()
     ref_value = pytorch_ops.untilize_with_unpadding(
-        x_ref, output_tensor_start=output_tensor_start, output_tensor_end=output_tensor_end
+        x_ref, output_tensor_end=output_tensor_end
     )

     tt_result = tt_untilize_with_unpadding(
         x=x,
-        output_tensor_start=output_tensor_start,
         output_tensor_end=output_tensor_end,
         device=device,
         dtype=[dtype],
@@ -68,14 +66,13 @@
         "SYSTEM_MEMORY",
         ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM),
         5263366,
-        [0, 0, 0, 0],
         [10, 9, 4, 1],
     ),
 ]


 @pytest.mark.parametrize(
-    "input_shape, dtype, dlayout, in_mem_config, out_mem_config, data_seed, output_tensor_start, output_tensor_end",
+    "input_shape, dtype, dlayout, in_mem_config, out_mem_config, data_seed, output_tensor_end",
     (test_sweep_args),
 )
 def test_untilize_with_unpadding_test(
@@ -85,7 +82,6 @@ def test_untilize_with_unpadding_test(
     in_mem_config,
     out_mem_config,
     data_seed,
-    output_tensor_start,
     output_tensor_end,
     device,
 ):
@@ -97,7 +93,6 @@ def test_untilize_with_unpadding_test(
         in_mem_config,
         out_mem_config,
         data_seed,
-        output_tensor_start,
         output_tensor_end,
         device,
     )
@@ -823,14 +823,13 @@ def gen_untilize_with_unpadding_args(
         input_shapes, dtypes, layouts, mem_configs, do_sanitize_args=do_sanitize_args
     ):
         if input_info is not None:
-            output_tensor_start = [0, 0, 0, 0]
-            output_tensor_end = [random.randrange(output_tensor_start[i], input_shapes[0][i], 1) for i in range(4)]
+            output_tensor_end = [random.randrange(0, input_shapes[0][i], 1) for i in range(4)]
             if output_tensor_end[-1] % 2 == 0:
                 output_tensor_end[-1] += 1
             input_info.update(
                 {
-                    "output_tensor_start": output_tensor_start,
                     "output_tensor_end": output_tensor_end,
+                    "use_multicore": True,
                 }
             )
             yield input_info
@@ -38,7 +38,6 @@ def create_grid(x, y):
         "output_mem_config": ttl.tensor.MemoryConfig(
             ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM
         ),
-        "output_tensor_start": [0, 0, 0, 0],
         "output_tensor_end": [0, 0, 119, 7299],
     },
 )
10 changes: 5 additions & 5 deletions tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
@@ -1200,13 +1200,13 @@ def tilize_with_val_padding(x, output_tensor_shape, pad_value, *args, **kwargs):
     return tilized


-def untilize_with_unpadding(x, output_tensor_start, output_tensor_end, *args, **kwargs):
+def untilize_with_unpadding(x, output_tensor_end, *args, **kwargs):
     untilized = untilize_util(x)
     unpad = untilized[
-        output_tensor_start[0] : output_tensor_end[0] + 1,
-        output_tensor_start[1] : output_tensor_end[1] + 1,
-        output_tensor_start[2] : output_tensor_end[2] + 1,
-        output_tensor_start[3] : output_tensor_end[3] + 1,
+        : output_tensor_end[0] + 1,
+        : output_tensor_end[1] + 1,
+        : output_tensor_end[2] + 1,
+        : output_tensor_end[3] + 1,
     ]
     return unpad
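For intuition, a small self-contained check of the reference slicing above, using plain PyTorch and a made-up shape and end point: with the start pinned at the origin, unpadding reduces to an end-inclusive slice per dimension.

    import torch

    # Hypothetical shape and end indices, chosen only for illustration.
    x = torch.arange(2 * 3 * 4 * 8, dtype=torch.float32).reshape(2, 3, 4, 8)
    output_tensor_end = [1, 2, 3, 5]
    unpad = x[
        : output_tensor_end[0] + 1,
        : output_tensor_end[1] + 1,
        : output_tensor_end[2] + 1,
        : output_tensor_end[3] + 1,
    ]
    assert unpad.shape == (2, 3, 4, 6)  # each dim keeps end index + 1 elements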
3 changes: 1 addition & 2 deletions tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py
@@ -2063,7 +2063,6 @@ def untilize_with_unpadding(
     layout,
     input_mem_config,
     output_mem_config,
-    output_tensor_start,
     output_tensor_end,
     **kwargs,
 ):
@@ -2085,7 +2084,7 @@ def untilize_with_unpadding(
     )

     t1 = ttl.tensor.untilize_with_unpadding(
-        t0, output_tensor_start, output_tensor_end, output_mem_config=output_mem_config
+        t0, output_tensor_end, output_mem_config=output_mem_config
     )

     return tt2torch_tensor(t1)
@@ -1655,7 +1655,6 @@ def test_block_sharded_untilize_with_unpadding(in_sharded, out_sharded, dtype, d

     yt = ttl.tensor.untilize_with_unpadding(
         xt,
-        ttl.tensor.Shape([0, 0, 0, 0]),
         ttl.tensor.Shape([0, 0, 391, 511]),
         output_mem_config=out_mem_config,
     )
@@ -1744,7 +1743,6 @@ def test_width_sharded_untilize_with_unpadding(

     yt = ttl.tensor.untilize_with_unpadding(
         xt,
-        ttl.tensor.Shape([0, 0, 0, 0]),
         ttl.tensor.Shape([N - 1, C - 1, output_H - 1, W - 1]),
         output_mem_config=out_mem_config,
     )
2 changes: 0 additions & 2 deletions tt_eager/tt_dnn/op_library/auto_format.cpp
@@ -153,7 +153,6 @@ Tensor AutoFormat::format_output_tensor(
     } else if (formatted_output.get_layout() == Layout::TILE && AutoFormat::legal_rm_shape(shape)) {
         formatted_output = untilize_with_unpadding(
             formatted_output,
-            {0, 0, 0, 0},
             {shape[0] - 1, shape[1] - 1, shape[2] - 1, shape[3] - 1},
             mem_config);
         return formatted_output;
@@ -163,7 +162,6 @@
         AutoFormat::legal_rm_shape(shape)) {
         formatted_output = untilize_with_unpadding(
             formatted_output,
-            {0, 0, 0, 0},
             {shape[0] - 1, shape[1] - 1, shape[2] - 1, shape[3] - 1},
             mem_config);
         return formatted_output;
@@ -53,9 +53,8 @@ void kernel_main() {

     cb_reserve_back(cb_id_in0, num_tiles_per_row * has_rows);
     uint32_t l1_write_addr = get_write_ptr(cb_id_in0);
-    uint32_t curr_stick_id = base_stick_id;
     for (uint32_t k = 0; k < num_rows; k++) {
-        uint64_t src_noc_addr = get_noc_addr(curr_stick_id + k, s);
+        uint64_t src_noc_addr = get_noc_addr(base_stick_id + k, s);

         // Read from DRAM to tmp buffer
         noc_async_read(src_noc_addr, l1_write_addr, unpadded_X_size);
113 changes: 0 additions & 113 deletions tt_eager/tt_dnn/op_library/tilize/tilize_multi_core/padding.h

This file was deleted.
