From e7ef646e9c0d4ed304a9438cbcc107331de785b6 Mon Sep 17 00:00:00 2001 From: VirdhatchaniKN Date: Wed, 21 Feb 2024 06:08:03 +0000 Subject: [PATCH] #0: Add PROD support --- docs/source/ttnn/ttnn/dependencies/tt_lib.rst | 4 + .../python_api_testing/sweep_tests/op_map.py | 8 +- .../pytests/tt_dnn/test_composite.py | 115 ++++--- .../sweep_tests/pytests/tt_dnn/test_prod.py | 75 +++++ .../sweep_tests/pytorch_ops.py | 7 +- .../sweep_tests/tt_lib_ops.py | 23 +- .../backward_ops/test_backward_prod.py | 64 ++++ .../backward_ops/utility_funcs.py | 33 ++ .../unit_testing/test_prod.py | 105 ------ .../unit_testing/test_prod_all.py | 78 +++++ .../unit_testing/test_prod_nc.py | 90 ++++++ tt_eager/tt_dnn/module.mk | 2 + .../op_library/backward/backward_ops.cpp | 107 +++++++ .../op_library/backward/backward_ops.hpp | 2 + .../op_library/composite/composite_ops.cpp | 82 ++++- .../op_library/composite/composite_ops.hpp | 5 +- .../eltwise_unary/eltwise_unary_op.cpp | 2 + .../eltwise_unary/eltwise_unary_op.hpp | 4 +- .../prod/kernels/compute/prod_all.cpp | 71 +++++ .../prod/kernels/{ => compute}/prod_nc.cpp | 9 +- .../kernels/{ => dataflow}/reader_prod_nc.cpp | 38 ++- .../prod/kernels/dataflow/utils.hpp | 16 + .../tt_dnn/op_library/prod/kernels/utils.hpp | 298 ------------------ .../prod/kernels/writer_prod_nc.cpp | 39 --- .../op_library/prod/prod_nc/prod_nc.cpp | 22 +- .../tt_dnn/op_library/prod/prod_nc_op.cpp | 26 +- .../tt_dnn/op_library/prod/prod_nc_op.hpp | 2 +- .../tt_dnn/op_library/prod/prod_op_all.cpp | 60 ++++ .../tt_dnn/op_library/prod/prod_op_all.hpp | 45 +++ .../single_core/prod_op_all_single_core.cpp | 137 ++++++++ .../tt_lib/csrc/operations/primary/module.hpp | 9 + .../tt_lib_bindings_tensor_backward_ops.cpp | 19 ++ .../tt_lib_bindings_tensor_composite_ops.cpp | 18 +- .../csrc/tt_lib_bindings_tensor_xary_ops.cpp | 1 + tt_eager/tt_numpy/functions.hpp | 114 +++++++ tt_metal/common/bfloat16.hpp | 3 + .../metal/llk_api/llk_math_unary_sfpu_api.h | 1 + .../llk_sfpu/ckernel_sfpu_tiled_prod.h | 34 ++ .../llk_math_eltwise_unary_sfpu_tiled_prod.h | 28 ++ .../grayskull/metal/llk_api/llk_sfpu_types.h | 1 + .../metal/llk_api/llk_math_unary_sfpu_api.h | 1 + .../llk_sfpu/ckernel_sfpu_tiled_prod.h | 34 ++ .../llk_math_eltwise_unary_sfpu_tiled_prod.h | 28 ++ .../metal/llk_api/llk_sfpu_types.h | 1 + tt_metal/include/compute_kernel_api.h | 24 ++ 45 files changed, 1320 insertions(+), 565 deletions(-) create mode 100644 tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_prod.py create mode 100644 tests/tt_eager/python_api_testing/unit_testing/backward_ops/test_backward_prod.py delete mode 100644 tests/tt_eager/python_api_testing/unit_testing/test_prod.py create mode 100644 tests/tt_eager/python_api_testing/unit_testing/test_prod_all.py create mode 100644 tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py create mode 100644 tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_all.cpp rename tt_eager/tt_dnn/op_library/prod/kernels/{ => compute}/prod_nc.cpp (92%) rename tt_eager/tt_dnn/op_library/prod/kernels/{ => dataflow}/reader_prod_nc.cpp (58%) create mode 100644 tt_eager/tt_dnn/op_library/prod/kernels/dataflow/utils.hpp delete mode 100644 tt_eager/tt_dnn/op_library/prod/kernels/utils.hpp delete mode 100644 tt_eager/tt_dnn/op_library/prod/kernels/writer_prod_nc.cpp create mode 100644 tt_eager/tt_dnn/op_library/prod/prod_op_all.cpp create mode 100644 tt_eager/tt_dnn/op_library/prod/prod_op_all.hpp create mode 100644 tt_eager/tt_dnn/op_library/prod/single_core/prod_op_all_single_core.cpp create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h create mode 100644 tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h diff --git a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst index 76ecefdff659..1238f5e9f8cb 100644 --- a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst +++ b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst @@ -785,6 +785,8 @@ Other Operations .. autofunction:: tt_lib.tensor.prod +.. autofunction:: tt_lib.tensor.tiled_prod + .. autofunction:: tt_lib.tensor.addcmul .. autofunction:: tt_lib.tensor.addcdiv @@ -830,6 +832,8 @@ Other Operations Backward Operations =================== +.. autofunction:: tt_lib.tensor.prod_bw + .. autofunction:: tt_lib.tensor.addalpha_bw .. autofunction:: tt_lib.tensor.addcmul_bw diff --git a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py index 6253efde62df..3ea51acf6406 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py @@ -48,6 +48,10 @@ "tt_lib_op": tt_lib_ops.arange, "pytorch_op": pytorch_ops.arange, }, + "prod": { + "tt_lib_op": tt_lib_ops.prod, + "pytorch_op": pytorch_ops.prod, + }, # stats "stats-var_hw": { "tt_lib_op": tt_lib_ops.var_hw, @@ -590,10 +594,6 @@ "tt_lib_op": tt_lib_ops.eltwise_subalpha, "pytorch_op": pytorch_ops.subalpha, }, - "eltwise-prod": { - "tt_lib_op": tt_lib_ops.eltwise_prod, - "pytorch_op": pytorch_ops.prod, - }, "eltwise-addalpha": { "tt_lib_op": tt_lib_ops.eltwise_addalpha, "pytorch_op": pytorch_ops.addalpha, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_composite.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_composite.py index a17cb6fb241b..b6fe77e640d3 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_composite.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_composite.py @@ -49,7 +49,7 @@ def custom_compare(*args, **kwargs): return result -shapes = ([[1, 1, 32, 32]], [[4, 3, 32, 64]]) +shapes = ([[1, 1, 32, 32]], [[1, 3, 320, 64]]) if is_wormhole_b0(): shapes = (shapes[0],) @@ -60,59 +60,58 @@ def custom_compare(*args, **kwargs): list( product( ( - # "lerp_binary", - # "lerp_ternary", - # "addcmul", - # "addcdiv", - # "min", - # "max", - # "swish", - # "log1p", - # "softplus", - # "mish", - # "silu", - # "polyval", - # "mac", - # "cbrt", - # "threshold", - # "hypot", - # "hardswish", - # "hardsigmoid", - # "ones_like", - # "zeros_like", - # "full_like", - # "ones", - # "empty", - # "zeros", - # "full", - # "arange", - # "hardshrink", - # "softshrink", - # "sinh", - # "cosh", - # "tanhshrink", - # "xlogy", - # "asinh", - # "acosh", - # "atanh", - # "atan2", - # "subalpha", - # "bias_gelu_unary", - # "addalpha", - # "logit", - # "logical_ori", - # "logical_xor", - # "logical_xori", - # "logical_noti", - # "logical_andi", - # "isclose", - # "digamma", - # "lgamma", - # "multigammaln", - # "polygamma", - # "nextafter", - # "scatter", - "prod", + "lerp_binary", + "lerp_ternary", + "addcmul", + "addcdiv", + "min", + "max", + "swish", + "log1p", + "softplus", + "mish", + "silu", + "polyval", + "mac", + "cbrt", + "threshold", + "hypot", + "hardswish", + "hardsigmoid", + "ones_like", + "zeros_like", + "full_like", + "ones", + "empty", + "zeros", + "full", + "arange", + "hardshrink", + "softshrink", + "sinh", + "cosh", + "tanhshrink", + "xlogy", + "asinh", + "acosh", + "atanh", + "atan2", + "subalpha", + "bias_gelu_unary", + "addalpha", + "logit", + "logical_ori", + "logical_xor", + "logical_xori", + "logical_noti", + "logical_andi", + "isclose", + "digamma", + "lgamma", + "multigammaln", + "polygamma", + "nextafter", + "scatter", ), shapes, ) @@ -129,7 +128,6 @@ def test_run_eltwise_composite_test(fn, input_shapes, device, function_level_def options["hypot"] = (1, 100) options["atan2"] = (-100, 100) options["cbrt"] = (-1000, 1000) - options["prod"] = (1, 1.5) options["hardsigmoid"] = (-100, 100) options["hardswish"] = (-100, 100) options["hardshrink"] = (-100, 100) @@ -164,13 +162,6 @@ def test_run_eltwise_composite_test(fn, input_shapes, device, function_level_def torch.int32, ) ] - elif fn in ["prod"]: # "prod_cpu" not implemented for 'BFloat16' - datagen_func = [ - generation_funcs.gen_func_with_cast( - partial(generator, low=options[fn][0], high=options[fn][1]), - torch.float32, - ) - ] else: datagen_func = [ generation_funcs.gen_func_with_cast( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_prod.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_prod.py new file mode 100644 index 000000000000..8ba79b93102e --- /dev/null +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_prod.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +import random +from functools import partial +import tt_lib as ttl + + +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) + +mem_configs = [ + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM), + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1), +] + + +@pytest.mark.parametrize( + "dim", + (3, 2, 1, 0, -1, -2, -3, -4), +) +@pytest.mark.parametrize("all_dimensions", [False, True]) +@pytest.mark.parametrize( + "input_shapes", + [ + [[1, 1, 32, 32]], + [[4, 3, 32, 32]], + [[2, 2, 32, 32]], + # [[6, 4, 32, 32]], #Fails for all_dimensions = True ( expected result is inf but the result generated in nan ) + # [[1, 1, 320, 320]], #Fails for all_dimensions = True ( expected result is inf but the result generated in nan ) + # [[1, 3, 320, 64]], #Fails for all_dimensions = True ( expected result is inf but the result generated in nan ) + ], +) +@pytest.mark.parametrize( + "dst_mem_config", + mem_configs, +) +class TestProd: + def test_run_prod_op( + self, + all_dimensions, + dim, + input_shapes, + dst_mem_config, + device, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=1, high=1.5), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update( + { + "all_dimensions": all_dimensions, + "dim": dim, + } + ) + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + + run_single_pytorch_test( + "prod", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index aec11ffdd014..f37a065243ea 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -863,8 +863,11 @@ def xlogy(x, y, *args, **kwargs): return torch.xlogy(x, y) -def prod(x, *args, **kwargs): - return torch.prod(x, 0) +def prod(x, *args, all_dimensions, dim, **kwargs): + if all_dimensions: + result = torch.prod(x) + return result.view(1, 1, 1, 1) + return torch.prod(x, dim, keepdim=True) def ldexp(x, y, *args, **kwargs): diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index 19b006433a2a..672641c84e6d 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -1301,6 +1301,28 @@ def arange( return tt2torch_tensor(t1) +@setup_host_and_device +def prod( + x, + *args, + all_dimensions, + dim, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = ttl.tensor.prod(t0, all_dimensions, dim, output_mem_config=output_mem_config) + output = tt2torch_tensor(t1) + if all_dimensions: + return output[:1, :1, :1, :1] + else: + return output + + @setup_host_and_device def eltwise_logical_andi( x, @@ -2262,7 +2284,6 @@ def binary_op( eltwise_isneginf = make_unary_op(ttl.tensor.isneginf) eltwise_isnan = make_unary_op(ttl.tensor.isnan) eltwise_logical_not_unary = make_unary_op(ttl.tensor.logical_not_unary) -eltwise_prod = make_unary_op(ttl.tensor.prod) eltwise_i0 = make_unary_op(ttl.tensor.i0) ################################################ diff --git a/tests/tt_eager/python_api_testing/unit_testing/backward_ops/test_backward_prod.py b/tests/tt_eager/python_api_testing/unit_testing/backward_ops/test_backward_prod.py new file mode 100644 index 000000000000..d66971dacb97 --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/backward_ops/test_backward_prod.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import pytest +import tt_lib +from tests.tt_eager.python_api_testing.unit_testing.backward_ops.utility_funcs import ( + data_gen_pt_tt, + data_gen_pt_tt_prod, + compare_results, +) + + +@pytest.mark.parametrize( + "input_shapes", + ( + (torch.Size([1, 1, 32, 32])), # 0 + (torch.Size([1, 1, 320, 384])), # 1 + (torch.Size([4, 2, 32, 32])), # 2 + (torch.Size([1, 3, 320, 384])), # 3 + (torch.Size([4, 3, 32, 32])), # 4 + (torch.Size([4, 3, 64, 64])), # 5 + (torch.Size([4, 3, 320, 320])), # 6 + (torch.Size([4, 3, 32, 32])), # 7 + (torch.Size([1, 3, 320, 320])), # 8 + (torch.Size([1, 4, 320, 384])), # 9 + (torch.Size([4, 4, 32, 32])), # 10 + (torch.Size([5, 4, 32, 32])), # 11 + (torch.Size([6, 4, 32, 32])), # 12 + (torch.Size([4, 5, 32, 32])), # 13 + (torch.Size([4, 6, 32, 32])), # 14 + (torch.Size([4, 10, 32, 32])), # 15 + (torch.Size([4, 20, 32, 32])), # 16 + (torch.Size([4, 30, 32, 32])), # 17 + (torch.Size([4, 31, 32, 32])), # 18 + (torch.Size([4, 32, 32, 32])), # 19 + (torch.Size([4, 33, 32, 32])), # 20 + (torch.Size([4, 63, 32, 32])), # 21 + (torch.Size([4, 64, 32, 32])), # 22 + (torch.Size([32, 64, 32, 32])), # 23 + ), +) +@pytest.mark.parametrize( + "dim", + [-4, -3, -2, -1, 0, 1, 2, 3], +) +@pytest.mark.parametrize("all_dimensions", [True, False]) +def test_bw_prod(input_shapes, all_dimensions, dim, device): + in_data, input_tensor = data_gen_pt_tt(input_shapes, device, True) + grad_data, grad_tensor = data_gen_pt_tt_prod(input_shapes, device, all_dimensions, dim) + if all_dimensions == False: + pyt_y = torch.prod(in_data, dim=dim, keepdim=True) + else: + pyt_y = torch.prod(in_data).view(1, 1, 1, 1) + tt_output_tensor_on_device = tt_lib.tensor.prod_bw(grad_tensor, input_tensor, all_dimensions, dim) + in_data.retain_grad() + pyt_y.backward(gradient=grad_data) + + golden_tensor = [in_data.grad] + + comp_pass = compare_results(tt_output_tensor_on_device, golden_tensor) + + assert comp_pass diff --git a/tests/tt_eager/python_api_testing/unit_testing/backward_ops/utility_funcs.py b/tests/tt_eager/python_api_testing/unit_testing/backward_ops/utility_funcs.py index aba1ed576298..b83d435d32db 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/backward_ops/utility_funcs.py +++ b/tests/tt_eager/python_api_testing/unit_testing/backward_ops/utility_funcs.py @@ -52,6 +52,39 @@ def data_gen_with_val(input_shapes, device, required_grad=False, val=1, is_row_m return pt_tensor, tt_tensor +def data_gen_pt_tt_prod(input_shapes, device, all_dimensions, dim, required_grad=False): + torch.manual_seed(213919) + pt_tensor_temp = torch.zeros(input_shapes, requires_grad=required_grad).bfloat16() + shape_Required = torch.Size( + [ + input_shapes[0] if (dim != 0 and dim != -4) else 1, + input_shapes[1] if (dim != 1 and dim != -3) else 1, + input_shapes[2] if (dim != 2 and dim != -2) else 1, + input_shapes[3] if (dim != 3 and dim != -1) else 1, + ] + ) + if all_dimensions == False and (dim == 1 or dim == 0 or dim == -4 or dim == -3): + pt_tensor = torch.randn(shape_Required, requires_grad=required_grad).bfloat16() + tt_tensor = ( + tt_lib.tensor.Tensor(pt_tensor, tt_lib.tensor.DataType.BFLOAT16).to(tt_lib.tensor.Layout.TILE).to(device) + ) + return pt_tensor, tt_tensor + elif all_dimensions == False: + pt_tensor = torch.randn(shape_Required, requires_grad=required_grad).bfloat16() + if dim == 3 or dim == -1: + pt_tensor_temp[:, :, :, :1] = pt_tensor + elif dim == 2 or dim == -2: + pt_tensor_temp[:, :, :1, :] = pt_tensor + else: + shape_Required = torch.Size([1, 1, 1, 1]) + pt_tensor = torch.randn(shape_Required, requires_grad=required_grad).bfloat16() + pt_tensor_temp[:1, :1, :1, :1] = pt_tensor + tt_tensor = ( + tt_lib.tensor.Tensor(pt_tensor_temp, tt_lib.tensor.DataType.BFLOAT16).to(tt_lib.tensor.Layout.TILE).to(device) + ) + return pt_tensor, tt_tensor + + def compare_results(tt_tensor, golden_tensor, pcc=0.99): status = True for i in range(len(tt_tensor)): diff --git a/tests/tt_eager/python_api_testing/unit_testing/test_prod.py b/tests/tt_eager/python_api_testing/unit_testing/test_prod.py deleted file mode 100644 index 84fbabc0ddc3..000000000000 --- a/tests/tt_eager/python_api_testing/unit_testing/test_prod.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch -from loguru import logger - -import tt_lib as ttl -from models.utility_functions import comp_allclose_and_pcc - -TILE_HEIGHT = 32 -TILE_WIDTH = 32 - - -def get_tensors(input_shape, output_shape, dims, device): - torch.manual_seed(2023) - npu_dtype = ttl.tensor.DataType.BFLOAT16 - # prod_cpu" not implemented for 'BFloat16' - cpu_dtype = torch.float32 - npu_layout = ttl.tensor.Layout.TILE - - if dims[0] == 3: - # print("dims ==> ", dims) - torch_input = torch.randint(-10, 10, input_shape, dtype=cpu_dtype, requires_grad=True) - torch_input_permute = torch_input.permute(3, 0, 1, 2) - torch_output = torch.randint(-10, 10, output_shape, dtype=cpu_dtype).permute(3, 0, 1, 2) - - print("torch_input shape ==> ", torch_input_permute.shape) - print("torch_output shape ==> ", torch_output.shape) - tt_input = ttl.tensor.Tensor(torch_input_permute, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) - tt_output = ttl.tensor.Tensor(torch_output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) - else: - torch_input = torch.randint(-100, 100, input_shape, dtype=cpu_dtype, requires_grad=True) - torch_output = torch.randint(-100, 100, output_shape, dtype=cpu_dtype) - - tt_input = ttl.tensor.Tensor(torch_input, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) - tt_output = ttl.tensor.Tensor(torch_output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) - - return tt_input, tt_output, torch_input - - -@pytest.mark.parametrize( - "input_shape", - ( - ([4, 5, 6, 2]), - # ([1, 1, TILE_HEIGHT - 1, TILE_WIDTH - 1]), - # ([4, 4, TILE_HEIGHT * 9 - 1, TILE_WIDTH * 12 - 1]), - # ([4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 9 - 1]), - # ([8, 8, TILE_HEIGHT * 4 - 1, TILE_WIDTH * 4 - 1]), - ), - ids=[ - "1, 1, TILE_HEIGHT-1,TILE_WIDTH - 1", - # "1, 2, TILE_HEIGHT-1,TILE_WIDTH - 1", - # "4, 4, TILE_HEIGHT * 9 - 1, TILE_WIDTH * 12 - 1", - # "4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 9 - 1", - # "8, 8, TILE_HEIGHT * 4 - 1, TILE_WIDTH * 4 - 1", - ], -) -@pytest.mark.parametrize( - "dims", - ( - [ - 3, - ], - # [ - # 1, - # ], - ), - ids=[ - "0", - ], -) -def test_moreh_prod_dims(input_shape, dims, device): - output_shape = input_shape.copy() - - for dim in dims: - output_shape[dim] = 1 # [4, 5, 6, 1] - - (tt_input, tt_output, torch_input) = get_tensors(input_shape, output_shape, dims, device) - - torch_output = torch.prod(torch_input, dims[0], True).permute(3, 0, 1, 2) - - if dims[0] == 3: - dims[0] = 0 - output_shape = [1, input_shape[0], input_shape[1], input_shape[2]] - - cpu_layout = ttl.tensor.Layout.ROW_MAJOR - tt_output_cpu = ( - ttl.operations.primary.prod_nc(tt_input, tt_output, dims=dims) - .cpu() - .to(cpu_layout) - .unpad_from_tile(output_shape) - .to_torch() - ) - - print("torch_output ===> ", torch_output, torch_output.shape) - print("tt_output_cpu ===> ", tt_output_cpu, tt_output_cpu.shape) - rtol = atol = 0.1 - passing, output_pcc = comp_allclose_and_pcc(torch_output, tt_output_cpu, pcc=0.999, rtol=rtol, atol=atol) - - logger.info(f"Out passing={passing}") - logger.info(f"Output pcc={output_pcc}") - - assert passing diff --git a/tests/tt_eager/python_api_testing/unit_testing/test_prod_all.py b/tests/tt_eager/python_api_testing/unit_testing/test_prod_all.py new file mode 100644 index 000000000000..2dd609a58328 --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/test_prod_all.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +from loguru import logger +from functools import partial + +import tt_lib as ttl +from models.utility_functions import comp_allclose_and_pcc + +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) + + +def get_tensors(input_shape, output_shape, device): + torch.manual_seed(2023) + npu_dtype = ttl.tensor.DataType.BFLOAT16 + cpu_dtype = torch.bfloat16 + npu_layout = ttl.tensor.Layout.TILE + + torch_input = torch.randint(1, 5, input_shape, dtype=cpu_dtype) + torch_output = torch.randint(1, 5, output_shape, dtype=cpu_dtype) + tt_input = ttl.tensor.Tensor(torch_input, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) + tt_output = ttl.tensor.Tensor(torch_output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) + + return tt_input, tt_output, torch_input + + +@pytest.mark.parametrize( + "shapes", + ( + ([1, 1, 32, 32]), + ([1, 4, 32, 32]), + ([2, 2, 32, 32]), + # ([6, 4, 32, 32]), #Fails : expected result is inf but the result generated in nan + # ([1, 1, 320, 320]), #Fails : expected result is inf but the result generated in nan + # ([1, 3, 320, 64]), #Fails : expected result is inf but the result generated in nan + ), +) +def test_prod(shapes, device): + output_shape = shapes.copy() + + (tt_input, tt_output, torch_input) = get_tensors(shapes, shapes, device) + + torch_output = torch.prod(torch_input) + + cpu_layout = ttl.tensor.Layout.ROW_MAJOR + tt_output_cpu = ( + ttl.operations.primary.prod_all(tt_input).cpu().to(cpu_layout).unpad_from_tile(output_shape).to_torch() + ) + N, C, H, W = tt_output_cpu.shape + torch.set_printoptions(threshold=10000, precision=5, sci_mode=False) + logger.info("Input shape") + logger.info(torch_input.shape) + logger.info("TT Output") + logger.info(tt_output_cpu[0, 0, 0, 0]) + logger.info("Torch Output") + logger.info(torch_output) + + # test for equivalance + # TODO(Dongjin) : check while changing rtol after enabling fp32_dest_acc_en + rtol = atol = 0.12 + # passing, output_pcc = comp_allclose_and_pcc(torch_output, tt_output_cpu, pcc=0.999, rtol=rtol, atol=atol) + passing, output_pcc = comp_allclose_and_pcc( + torch_output, tt_output_cpu[0, 0, 0, 0], pcc=0.999, rtol=rtol, atol=atol + ) + + logger.info(f"Out passing={passing}") + logger.info(f"Output pcc={output_pcc}") + + assert passing diff --git a/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py b/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py new file mode 100644 index 000000000000..c761082e37a6 --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/test_prod_nc.py @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +from loguru import logger + +import tt_lib as ttl +from models.utility_functions import comp_allclose_and_pcc + +TILE_HEIGHT = 32 +TILE_WIDTH = 32 + + +def get_tensors(input_shape, output_shape, device): + torch.manual_seed(2023) + npu_dtype = ttl.tensor.DataType.BFLOAT16 + cpu_dtype = torch.bfloat16 + npu_layout = ttl.tensor.Layout.TILE + + torch_input = torch.randint(-100, 100, input_shape, dtype=cpu_dtype) + torch_output = torch.randint(-100, 100, output_shape, dtype=cpu_dtype) + + tt_input = ttl.tensor.Tensor(torch_input, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) + tt_output = ttl.tensor.Tensor(torch_output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device) + + return tt_input, tt_output, torch_input + + +@pytest.mark.parametrize( + "input_shape", + ( + ([2, 3, TILE_HEIGHT * 6 - 1, TILE_WIDTH * 7 - 1]), + ([9, 16, TILE_HEIGHT * 13 - 1, TILE_WIDTH * 19 - 1]), + ([4, 3, TILE_HEIGHT * 3 - 1, TILE_WIDTH * 11 - 1]), + ([1, 1, TILE_HEIGHT - 1, TILE_WIDTH - 1]), + ([4, 4, TILE_HEIGHT * 9 - 1, TILE_WIDTH * 12 - 1]), + ([4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 9 - 1]), + ([8, 8, TILE_HEIGHT * 4 - 1, TILE_WIDTH * 4 - 1]), + ), + ids=[ + "2, 3, TILE_HEIGHT * 6 - 1, TILE_WIDTH * 7 - 1", + "9, 16, TILE_HEIGHT * 13 - 1, TILE_WIDTH * 19 - 1", + "4, 3, TILE_HEIGHT * 9 - 1, TILE_WIDTH * 11 - 1", + "1, 1, TILE_HEIGHT-1,TILE_WIDTH - 1", + "4, 4, TILE_HEIGHT * 9 - 1, TILE_WIDTH * 12 - 1", + "4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 9 - 1", + "8, 8, TILE_HEIGHT * 4 - 1, TILE_WIDTH * 4 - 1", + ], +) +@pytest.mark.parametrize( + "dims", + ( + [ + 0, + ], + [ + 1, + ], + ), + ids=["0", "1"], +) +# Support for dim 2,3 in composite_ops +def test_prod_dims(input_shape, dims, device): + output_shape = input_shape.copy() + + for dim in dims: + output_shape[dim] = 1 + + (tt_input, tt_output, torch_input) = get_tensors(input_shape, output_shape, device) + + torch_output = torch.prod(torch_input, dims[0], True) + + cpu_layout = ttl.tensor.Layout.ROW_MAJOR + tt_output_cpu = ( + ttl.operations.primary.prod_nc(tt_input, tt_output, dims=dims) + .cpu() + .to(cpu_layout) + .unpad_from_tile(output_shape) + .to_torch() + ) + + rtol = atol = 0.1 + passing, output_pcc = comp_allclose_and_pcc(torch_output, tt_output_cpu, pcc=0.999, rtol=rtol, atol=atol) + + logger.info(f"Out passing={passing}") + logger.info(f"Output pcc={output_pcc}") + + assert passing diff --git a/tt_eager/tt_dnn/module.mk b/tt_eager/tt_dnn/module.mk index 478d4b88e834..23c41261b705 100644 --- a/tt_eager/tt_dnn/module.mk +++ b/tt_eager/tt_dnn/module.mk @@ -109,6 +109,8 @@ TT_DNN_SRCS = \ tt_eager/tt_dnn/op_library/moreh_sum/moreh_sum_op.cpp \ tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp \ tt_eager/tt_dnn/op_library/prod/prod_nc_op.cpp \ + tt_eager/tt_dnn/op_library/prod/prod_op_all.cpp \ + tt_eager/tt_dnn/op_library/prod/single_core/prod_op_all_single_core.cpp \ tt_eager/tt_dnn/op_library/moreh_sum_backward/moreh_sum_backward.cpp \ tt_eager/tt_dnn/op_library/moreh_sum_backward/moreh_sum_backward_op.cpp \ tt_eager/tt_dnn/op_library/moreh_mean/moreh_mean_h/moreh_mean_h.cpp \ diff --git a/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp b/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp index 6522ff3e576f..e2f0811945f2 100644 --- a/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp +++ b/tt_eager/tt_dnn/op_library/backward/backward_ops.cpp @@ -13,6 +13,8 @@ #include "tt_dnn/op_library/math.hpp" #include "tt_dnn/op_library/unpad/unpad_op.hpp" #include "tt_dnn/op_library/complex/complex_ops.hpp" +#include "tt_eager/tt_dnn/op_library/pad/pad_op.hpp" +#include "tt_dnn/op_library/permute/permute_op.hpp" namespace tt { @@ -1517,6 +1519,111 @@ std::vector binary_gt_bw(const Tensor& grad, const Tensor& input, const return operation::decorate_as_composite(__func__, _binary_gt_bw)(grad, input, output_mem_config); } +// Prod +// along a single dimension --> result: grad_data * (y / input ) +std::vector _prod_bw( + const Tensor& grad, const Tensor& input, bool all_dimensions, int64_t dim, const MemoryConfig& output_mem_config) { + std::vector grad_tensor; + Tensor prod_result = prod(input, all_dimensions, dim, output_mem_config); + if (all_dimensions == true) { + Tensor temp = mul(prod_result, grad, std::nullopt, output_mem_config); // result is stored in the first position + Tensor fill_tensor = tt::numpy::fill_first_val_into_tensor( temp, temp.get_dtype(), temp.get_layout(), temp.device(), output_mem_config); + Tensor all_dimension_result = mul(recip(input, output_mem_config), fill_tensor, std::nullopt, output_mem_config); + grad_tensor.emplace_back(all_dimension_result); + return grad_tensor; + } + // all_dimensions = False + Tensor updated_grad = prod_result; + if (prod_result.get_legacy_shape() != grad.get_legacy_shape()) { + if (dim == 3 || dim == -1) { + std::vector after_permute_dims = {0, 3, 1, 2}; + Tensor required = permute(grad, after_permute_dims, output_mem_config); + const Shape start_index = {0, 0, 0, 0}; + const Shape end_index = { grad.get_legacy_shape()[0] - 1, 0, grad.get_legacy_shape()[1] - 1, grad.get_legacy_shape()[2] - 1}; + Tensor new_unpad_tensor = unpad(required, start_index, end_index); + after_permute_dims = {0, 2, 3, 1}; + updated_grad = permute(new_unpad_tensor, after_permute_dims, output_mem_config); + } else if (dim == 2 || dim == -2) { + std::vector after_permute_dims = {0, 2, 1, 3}; + Tensor required = permute(grad, after_permute_dims, output_mem_config); + const Shape start_index = {0, 0, 0, 0}; + const Shape end_index = { grad.get_legacy_shape()[0] - 1, 0, grad.get_legacy_shape()[1] - 1, grad.get_legacy_shape()[3] - 1}; + Tensor new_unpad_tensor = unpad(required, start_index, end_index); + updated_grad = permute(new_unpad_tensor, after_permute_dims, output_mem_config); + } + } + Tensor reciprocal_input = recip(input, output_mem_config); + Tensor temp = mul(prod_result, (dim == 1 || dim == 0 || dim == -4 || dim == -3) ? grad : updated_grad, std::nullopt, output_mem_config); + if (dim == 3 || dim == -1) { + Tensor grad_result = bcast(reciprocal_input, temp, BcastOpMath::MUL, BcastOpDim::W, output_mem_config); + grad_tensor.emplace_back(grad_result); + return grad_tensor; + } else if (dim == 2 || dim == -2) { + Tensor grad_result = bcast(reciprocal_input, temp, BcastOpMath::MUL, BcastOpDim::H, output_mem_config); + grad_tensor.emplace_back(grad_result); + return grad_tensor; + } else if (dim == 1 || dim == -3) { + Tensor tensor_1_temp = reciprocal_input; + if (reciprocal_input.get_legacy_shape()[1] % 32 != 0) { + const Shape start_index = {0, 0, 0, 0}; + const Shape required_shape = { + reciprocal_input.get_legacy_shape()[0], + reciprocal_input.get_legacy_shape()[1] + (32 - (reciprocal_input.get_legacy_shape()[1] % 32)), + reciprocal_input.get_legacy_shape()[2], + reciprocal_input.get_legacy_shape()[3]}; + tensor_1_temp = pad(reciprocal_input, required_shape, start_index, 0); + } + std::vector after_permute_dims = {0, 2, 3, 1}; + Tensor tensor_1 = permute(tensor_1_temp, after_permute_dims, output_mem_config); + Tensor tensor_2 = permute(temp, after_permute_dims, output_mem_config); + after_permute_dims = {0, 3, 1, 2}; + Tensor result = permute( bcast(tensor_1, tensor_2, BcastOpMath::MUL, BcastOpDim::W, output_mem_config), after_permute_dims, output_mem_config); + Tensor grad_result = result; + if (reciprocal_input.get_legacy_shape()[1] % 32 != 0) { + const Shape start_index = {0, 0, 0, 0}; + const Shape end_index = { + input.get_legacy_shape()[0] - 1, + input.get_legacy_shape()[1] - 1, + input.get_legacy_shape()[2] - 1, + input.get_legacy_shape()[3] - 1}; + grad_result = unpad(result, start_index, end_index); + } + grad_tensor.emplace_back(grad_result); + return grad_tensor; + } + // dim 0 + Tensor tensor_1_temp = reciprocal_input; + if (reciprocal_input.get_legacy_shape()[0] % 32 != 0) { + const Shape start_index = {0, 0, 0, 0}; + const Shape required_shape = { + reciprocal_input.get_legacy_shape()[0] + (32 - (reciprocal_input.get_legacy_shape()[0] % 32)), + reciprocal_input.get_legacy_shape()[1], + reciprocal_input.get_legacy_shape()[2], + reciprocal_input.get_legacy_shape()[3]}; + tensor_1_temp = pad(reciprocal_input, required_shape, start_index, 0); + } + std::vector after_permute_dims = {3, 1, 2, 0}; + Tensor tensor_1 = permute(tensor_1_temp, after_permute_dims, output_mem_config); + Tensor tensor_2 = permute(temp, after_permute_dims, output_mem_config); + Tensor result = permute( bcast(tensor_1, tensor_2, BcastOpMath::MUL, BcastOpDim::W, output_mem_config), after_permute_dims, output_mem_config); + Tensor grad_result = result; + if (reciprocal_input.get_legacy_shape()[0] % 32 != 0) { + const Shape start_index = {0, 0, 0, 0}; + const Shape end_index = { + input.get_legacy_shape()[0] - 1, + input.get_legacy_shape()[1] - 1, + input.get_legacy_shape()[2] - 1, + input.get_legacy_shape()[3] - 1}; + grad_result = unpad(result, start_index, end_index); + } + grad_tensor.emplace_back(grad_result); + return grad_tensor; +} +std::vector prod_bw( + const Tensor& grad, const Tensor& input, bool all_dimensions, int64_t dim, const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _prod_bw)(grad, input, all_dimensions, dim, output_mem_config); +} + // square // result: 2 * input * grad_data std::vector _square_bw(const Tensor& grad, const Tensor& input, const MemoryConfig& output_mem_config) { diff --git a/tt_eager/tt_dnn/op_library/backward/backward_ops.hpp b/tt_eager/tt_dnn/op_library/backward/backward_ops.hpp index a67d32cf05cd..cb1607296c05 100644 --- a/tt_eager/tt_dnn/op_library/backward/backward_ops.hpp +++ b/tt_eager/tt_dnn/op_library/backward/backward_ops.hpp @@ -220,6 +220,8 @@ std::vector frac_bw(const Tensor& grad, const Tensor& input, const Memor std::vector trunc_bw(const Tensor& grad, const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); +std::vector prod_bw(const Tensor& grad, const Tensor& input, bool all_dimensions, int64_t dim, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + std::vector log_sigmoid_bw(const Tensor& grad, const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); std::vector tanhshrink_bw(const Tensor& grad, const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index 3cd2a0336c73..9925b9916bfd 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -17,6 +17,9 @@ #include "tt_eager/tt_dnn/op_library/pad/pad_op.hpp" #include "tt_numpy/functions.hpp" #include "tt_dnn/op_library/prod/prod_nc_op.hpp" +#include "tt_dnn/op_library/prod/prod_op_all.hpp" +#include "tt_dnn/op_library/permute/permute_op.hpp" +#include "tt_eager/tt_dnn/op_library/unpad/unpad_op.hpp" namespace tt { namespace tt_metal { @@ -902,15 +905,78 @@ Tensor xlogy(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& o return operation::decorate_as_composite(__func__, _xlogy)(input_a, input_b, output_mem_config); } -Tensor _prod(const Tensor& input_a, const MemoryConfig& output_mem_config) { - std::vector dim = {0}; - Shape input_shape = input_a.shape(); - Shape required = { 1, input_shape[1], input_shape[2], input_shape[3]}; - Tensor result = tt::operations::primary::prod_nc(input_a, zeros( required, input_a.dtype(), input_a.layout(), input_a.device(), output_mem_config), dim, output_mem_config); - return result; +Tensor prod_all(const Tensor& input_a, const MemoryConfig& output_mem_config) { + auto formatted_input_tensor = input_a; + if(formatted_input_tensor.get_layout()==Layout::ROW_MAJOR){ + auto a_pad_shape = AutoFormat::pad_to_tile_shape(input_a.get_legacy_shape(), false, false, true, true); + auto out_shape = input_a.get_legacy_shape(); + out_shape = {out_shape[0], out_shape[1], out_shape[2], out_shape[3]}; + if (!AutoFormat::check_input_tensor_format(input_a, a_pad_shape)) { + formatted_input_tensor = AutoFormat::format_input_tensor(input_a, input_a.device(), a_pad_shape, 1.0, Layout::TILE); + } + } + return tt::operations::primary::prod_all(formatted_input_tensor, output_mem_config); +} + +Tensor prod_nc(const Tensor& temp, int64_t dim, const MemoryConfig& output_mem_config) { + //layout conversion + auto formatted_input_tensor = temp; + if(formatted_input_tensor.get_layout()==Layout::ROW_MAJOR){ + auto a_pad_shape = AutoFormat::pad_to_tile_shape(temp.get_legacy_shape(), false, false, true, true); + auto out_shape = temp.get_legacy_shape(); + out_shape = {out_shape[0], out_shape[1], out_shape[2], out_shape[3]}; + if (!AutoFormat::check_input_tensor_format(temp, a_pad_shape)) { + formatted_input_tensor = AutoFormat::format_input_tensor(temp, temp.device(), a_pad_shape, 1.0, Layout::TILE); + } + } + //Apply prod + std::vector dimension = {(dim == 1 || dim == -3) ? 1 : 0}; + Shape input_shape = formatted_input_tensor.get_legacy_shape(); + Shape required = { ((dim == 1 || dim == -3) ? input_shape[0] : 1), ((dim == 1 || dim == -3) ? 1 : input_shape[1]) , input_shape[2], input_shape[3]}; + return tt::operations::primary::prod_nc(formatted_input_tensor, zeros( required, formatted_input_tensor.get_dtype(), formatted_input_tensor.get_layout(), formatted_input_tensor.device(), output_mem_config), dimension, output_mem_config); +} + +Tensor _prod(const Tensor& input_a, bool all_dimensions, int64_t dim, const MemoryConfig& output_mem_config) { + if(all_dimensions){ + return tt::tt_metal::prod_all(input_a, output_mem_config); + } + TT_FATAL(dim >= -4 && dim <= 3 && "Dimension out of range (expected to be in range of [-4, 3]"); + Tensor temp = input_a; + //Permute for dim 2,3 + if(dim == 2 || dim == -2){ + std::vector permute_dims = {2, 0, 1, 3}; + temp = permute(input_a, permute_dims, output_mem_config); + }else if(dim == 3 || dim == -1){ + std::vector permute_dims = {3, 0, 1, 2}; + temp = permute(input_a, permute_dims, output_mem_config); + } + Tensor result = tt::tt_metal::prod_nc(temp, dim, output_mem_config); + //Permute and unpad result for dim 2,3 + if(dim == 0 || dim == 1 || dim == -4 || dim == -3){ + return result; + }else if(dim == 2 || dim == -2){ + std::vector after_permute_dims = {1, 2, 0, 3}; + Tensor required = permute(result, after_permute_dims, output_mem_config); + Shape input_shape = input_a.get_legacy_shape(); + const Shape start_index = {0, 0, 0, 0}; + const Shape end_index = {input_shape[0]-1, input_shape[1]-1, 0, input_shape[3]-1}; + return unpad( required, start_index, end_index); + }else{ //dim 3 + //permute + std::vector after_permute_dims = {1, 2, 0, 3}; + Tensor required = permute(result, after_permute_dims, output_mem_config); + //unpad + Shape input_shape = input_a.get_legacy_shape(); + const Shape start_index = {0, 0, 0, 0}; + const Shape end_index = {input_shape[0]-1, input_shape[1]-1, 0, input_shape[2]-1}; + Tensor new_unpad_tensor = unpad( required, start_index, end_index); + //permute back + after_permute_dims = {0, 1, 3, 2}; + return permute(new_unpad_tensor, after_permute_dims, output_mem_config); + } } -Tensor prod(const Tensor& input_a, const MemoryConfig& output_mem_config) { - return operation::decorate_as_composite(__func__, _prod)(input_a, output_mem_config); +Tensor prod(const Tensor& input_a, bool all_dimensions, int64_t dim, const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _prod)(input_a, all_dimensions, dim, output_mem_config); } Tensor _variance_impl( diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp index 463eb928101f..2845ed867155 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp @@ -178,11 +178,14 @@ Tensor logical_noti( float immediate, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); -// prod +//prod Tensor prod( const Tensor& input_a, + bool all_dimensions = false, + int64_t dim = 0, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + /* Returns a new tensor with the signed angles in radians between vectors diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp index f24b3f4ddeb6..623db2e6eddc 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp @@ -170,6 +170,8 @@ std::pair get_op_init_and_func_default(UnaryOpType op_type, stri op_init_and_name = {"sign_tile_init();", fmt::format("sign_tile({});", idst)}; break; case UnaryOpType::SQUARE: op_init_and_name = {"square_tile_init();", fmt::format("square_tile({});", idst)}; break; + case UnaryOpType::TILED_PROD: + op_init_and_name = {"tiled_prod_tile_init();", fmt::format("tiled_prod_tile({});", idst)}; break; case UnaryOpType::EQZ: op_init_and_name = {"eqz_tile_init();", fmt::format("eqz_tile({});", idst)}; break; case UnaryOpType::NEZ: diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp index 7ba130714b2e..8437b76c3d67 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp @@ -74,7 +74,8 @@ enum class UnaryOpType { SUB_UNARY_SFPU = 56, MUL_UNARY_SFPU = 57, DIV_UNARY_SFPU = 58, - IDENTITY_UINT32 = 59 + IDENTITY_UINT32 = 59, + TILED_PROD = 60 }; template @@ -270,6 +271,7 @@ constexpr auto recip = make_eltwise_unary{}; constexpr auto relu = make_eltwise_unary{}; constexpr auto relu6 = make_eltwise_unary{}; constexpr auto sigmoid = make_eltwise_unary{}; +constexpr auto tiled_prod = make_eltwise_unary{}; constexpr auto log = make_eltwise_unary{}; constexpr auto tanh = make_eltwise_unary{}; constexpr auto log2 = make_eltwise_unary{}; diff --git a/tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_all.cpp b/tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_all.cpp new file mode 100644 index 000000000000..9375737b75ed --- /dev/null +++ b/tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_all.cpp @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "tt_metal/include/compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" +#include "compute_kernel_api/eltwise_unary/negative.h" + +namespace NAMESPACE { +void MAIN { + + constexpr uint32_t num_tiles = get_compile_time_arg_val(0); + constexpr uint32_t per_core_block_dim = get_compile_time_arg_val(1); + + binary_op_init_common(tt::CB::c_in0, tt::CB::c_intermed0, tt::CB::c_out0); + bool last_tile = false; + bool once = true; + for (uint32_t t = 0; t < num_tiles; t++) { + if ( t == (num_tiles - 1)) + { + last_tile = true; + } + cb_reserve_back(tt::CB::c_out0, 1); + for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { + cb_wait_front(tt::CB::c_in0, 1); + if (once) + { + cb_reserve_back(tt::CB::c_intermed0, 1); + tile_regs_acquire(); + copy_tile_to_dst_init_short(); + copy_tile(tt::CB::c_in0, 0, 0); // copy from c_in[0] to DST[0] + tile_regs_commit(); + tile_regs_wait(); + if constexpr (num_tiles == 1) + pack_tile(0, tt::CB::c_out0); + else + { + pack_tile(0, tt::CB::c_intermed0); + cb_push_back(tt::CB::c_intermed0, 1); + } + tile_regs_release(); + }else { + tile_regs_acquire(); + mul_tiles_init(); + mul_tiles(tt::CB::c_in0, tt::CB::c_intermed0, 0, 0, 0); + tile_regs_commit(); + tile_regs_wait(); + if (last_tile) + { + pack_tile(0, tt::CB::c_out0); + } + else + { + cb_pop_front(tt::CB::c_intermed0, 1); + cb_reserve_back(tt::CB::c_intermed0, 1); + pack_tile(0, tt::CB::c_intermed0); + cb_push_back(tt::CB::c_intermed0, 1); + } + tile_regs_release(); + } + once = false; + cb_pop_front(tt::CB::c_in0, 1); + } + cb_push_back(tt::CB::c_out0, 1); +} +} +} diff --git a/tt_eager/tt_dnn/op_library/prod/kernels/prod_nc.cpp b/tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_nc.cpp similarity index 92% rename from tt_eager/tt_dnn/op_library/prod/kernels/prod_nc.cpp rename to tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_nc.cpp index dc11fe4946e6..a599e01631ac 100644 --- a/tt_eager/tt_dnn/op_library/prod/kernels/prod_nc.cpp +++ b/tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_nc.cpp @@ -7,9 +7,6 @@ #include "compute_kernel_api/eltwise_binary.h" #include "compute_kernel_api/tile_move_copy.h" -ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } -ALWI void REL() { release_dst(tt::DstMode::Half); } - namespace NAMESPACE { void MAIN { const auto num_input_tiles = get_arg_val(0); @@ -33,14 +30,15 @@ void MAIN { bool last_out = (j == num_input_tiles - 1); uint32_t cb_add = (enable_reload) ? (cb_intermed0) : (cb_in1); - ACQ(); cb_wait_front(cb_in0, onetile); if (enable_reload) { cb_wait_front(cb_intermed0, onetile); } + tile_regs_acquire(); mul_tiles_init(); mul_tiles(cb_in0, cb_add, first_tile, first_tile, dst0); + tile_regs_commit(); cb_pop_front(cb_in0, onetile); if (enable_reload) { @@ -49,9 +47,10 @@ void MAIN { uint32_t cb_out = (last_out) ? (cb_out0) : (cb_intermed0); cb_reserve_back(cb_out, onetile); + tile_regs_wait(); pack_tile(dst0, cb_out); + tile_regs_release(); cb_push_back(cb_out, onetile); - REL(); enable_reload = true; } } diff --git a/tt_eager/tt_dnn/op_library/prod/kernels/reader_prod_nc.cpp b/tt_eager/tt_dnn/op_library/prod/kernels/dataflow/reader_prod_nc.cpp similarity index 58% rename from tt_eager/tt_dnn/op_library/prod/kernels/reader_prod_nc.cpp rename to tt_eager/tt_dnn/op_library/prod/kernels/dataflow/reader_prod_nc.cpp index 27d61028a3e7..3c1fdfaf0f63 100644 --- a/tt_eager/tt_dnn/op_library/prod/kernels/reader_prod_nc.cpp +++ b/tt_eager/tt_dnn/op_library/prod/kernels/dataflow/reader_prod_nc.cpp @@ -5,11 +5,7 @@ #include #include "dataflow_api.h" -#include "tt_eager/tt_dnn/op_library/moreh_sum/kernels/utils.hpp" - -inline uint32_t get_read_tile_id(uint32_t tile_id, uint32_t dim, uint32_t input_tile_offset, uint32_t HtWt) { - return (dim == 0 ) ? (tile_id) : (tile_id / HtWt * input_tile_offset) + (tile_id % HtWt); -} +#include "tt_eager/tt_dnn/kernels/dataflow/moreh_common.hpp" void kernel_main() { const auto input_addr = get_arg_val(0); @@ -17,10 +13,10 @@ void kernel_main() { const auto num_output_tiles = get_arg_val(2); const auto input_tile_offset = get_arg_val(3); const auto start_id = get_arg_val(4); - const auto input_is_dram = (get_arg_val(5) == 1); + const auto input_is_dram = get_compile_time_arg_val(0) == 1; const auto HtWt = get_arg_val(6); const auto CHtWt = get_arg_val(7); - const auto dim = get_arg_val(8); + const auto dim = get_compile_time_arg_val(1); constexpr uint32_t onetile = 1; constexpr uint32_t cb_id_in0 = 0; @@ -36,24 +32,34 @@ void kernel_main() { uint32_t l1_write_addr_in0; uint32_t input_tile_bytes = get_tile_size(cb_id_in0); const auto input_data_format = get_dataformat(cb_id_in0); - const InterleavedAddrGenFast dram_input_addrg = { - .bank_base_address = input_addr, .page_size = input_tile_bytes, .data_format = input_data_format}; - const InterleavedAddrGenFast l1_input_addrg = { + const InterleavedAddrGenFast dram_input_addrg = { .bank_base_address = input_addr, .page_size = input_tile_bytes, .data_format = input_data_format}; + uint32_t read_tile_id_temp = (dim == 0 ) ? (start_id) : (start_id / HtWt * CHtWt) + (start_id % HtWt); + uint32_t start_tile_id = start_id / HtWt * CHtWt; + uint32_t end_tile_id = start_tile_id + HtWt - 1 ; + uint32_t read_tile_id = read_tile_id_temp; for (uint32_t i = start_id; i < start_id + num_output_tiles; i++) { - auto read_tile_id = get_read_tile_id(i, dim, CHtWt, HtWt); + if constexpr (dim == 0){ + read_tile_id = i; + } for (uint32_t j = 0; j < num_input_tiles; ++j) { cb_reserve_back(cb_id_in0, onetile); l1_write_addr_in0 = get_write_ptr(cb_id_in0); - if (input_is_dram) { - noc_async_read_tile(read_tile_id, dram_input_addrg, l1_write_addr_in0); - } else { - noc_async_read_tile(read_tile_id, l1_input_addrg, l1_write_addr_in0); - } + noc_async_read_tile(read_tile_id, dram_input_addrg, l1_write_addr_in0); noc_async_read_barrier(); cb_push_back(cb_id_in0, onetile); read_tile_id += input_tile_offset; } + if constexpr (dim != 0){ + if(read_tile_id_temp == end_tile_id){ + start_tile_id = start_tile_id + CHtWt; + read_tile_id_temp = start_tile_id; + end_tile_id = read_tile_id_temp + HtWt - 1; + }else{ + read_tile_id_temp = read_tile_id_temp + 1; + } + read_tile_id = read_tile_id_temp; + } } } diff --git a/tt_eager/tt_dnn/op_library/prod/kernels/dataflow/utils.hpp b/tt_eager/tt_dnn/op_library/prod/kernels/dataflow/utils.hpp new file mode 100644 index 000000000000..b9c8ecc95b5f --- /dev/null +++ b/tt_eager/tt_dnn/op_library/prod/kernels/dataflow/utils.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void fill_cb_with_value(uint32_t cb_id, uint32_t value, int32_t num_of_elems = 1024) { + cb_reserve_back(cb_id, 1); + volatile tt_l1_ptr std::uint16_t* ptr = (volatile tt_l1_ptr uint16_t*)(get_write_ptr(cb_id)); + for (int j = 0; j < num_of_elems; j++) { + ptr[j] = uint16_t(value >> 16); + } + cb_push_back(cb_id, 1); +} diff --git a/tt_eager/tt_dnn/op_library/prod/kernels/utils.hpp b/tt_eager/tt_dnn/op_library/prod/kernels/utils.hpp deleted file mode 100644 index c50d0254e341..000000000000 --- a/tt_eager/tt_dnn/op_library/prod/kernels/utils.hpp +++ /dev/null @@ -1,298 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "dataflow_api.h" - -void fill_cb_with_value(uint32_t cb_id, uint32_t value, int32_t num_of_elems = 1024) { - cb_reserve_back(cb_id, 1); - auto ptr = reinterpret_cast(get_write_ptr(cb_id)); - for (int j = 0; j < num_of_elems; j++) { - ptr[j] = uint16_t(value >> 16); - } - cb_push_back(cb_id, 1); -} - -void generate_mask_h_w(uint32_t cb_mask_h_w, uint32_t mask_h, uint32_t mask_w, uint32_t single_tile_size = 2048) { - union { - float f; - uint32_t u; - } one; - one.f = 1.0f; - union { - float f; - uint32_t u; - } zero; - zero.f = 0.0f; - - const auto u16_one = uint16_t(one.u >> 16); - const auto u16_zero = uint16_t(zero.u >> 16); - - cb_reserve_back(cb_mask_h_w, 2); - - // mask_h - // first tile ptr - auto mask_h_ptr = reinterpret_cast(get_write_ptr(cb_mask_h_w)); - for (uint32_t w = 0; w < 16; w++) { - // sub tile 0 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) { - mask_h_0 = 16; - } - uint32_t h = 0; - for (; h < mask_h_0; h++) { - mask_h_ptr[h * 16 + w] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w] = u16_zero; - } - } - - // sub tile 1 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) { - mask_h_0 = 16; - } - uint32_t h = 0; - for (; h < mask_h_0; h++) { - mask_h_ptr[h * 16 + w + 256] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w + 256] = u16_zero; - } - } - - // sub tile 2 - { - uint32_t mask_h_1 = (mask_h < 16) ? 0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - mask_h_ptr[h * 16 + w + 512] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w + 512] = u16_zero; - } - } - - // sub tile 3 - { - uint32_t mask_h_1 = (mask_h < 16) ? 0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - mask_h_ptr[h * 16 + w + 768] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w + 768] = u16_zero; - } - } - } - - // mask_w - // second tile ptr - auto mask_w_ptr = reinterpret_cast(get_write_ptr(cb_mask_h_w) + single_tile_size); - for (uint32_t h = 0; h < 16; h++) { - // sub tile 0 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) { - mask_w_0 = 16; - } - uint32_t w = 0; - for (; w < mask_w_0; w++) { - mask_w_ptr[h * 16 + w] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w] = u16_zero; - } - } - - // sub tile 1 - { - uint32_t mask_w_1 = (mask_w < 16) ? 0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - mask_w_ptr[h * 16 + w + 256] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w + 256] = u16_zero; - } - } - - // sub tile 2 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) { - mask_w_0 = 16; - } - uint32_t w = 0; - for (; w < mask_w_0; w++) { - mask_w_ptr[h * 16 + w + 512] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w + 512] = u16_zero; - } - } - - // sub tile 3 - { - uint32_t mask_w_1 = (mask_w < 16) ? 0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - mask_w_ptr[h * 16 + w + 768] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w + 768] = u16_zero; - } - } - } - - cb_push_back(cb_mask_h_w, 2); -} - -void generate_mask_w(uint32_t cb_mask, uint32_t mask_w) { - union { - float f; - uint32_t u; - } one; - one.f = 1.0f; - union { - float f; - uint32_t u; - } zero; - zero.f = 0.0f; - - cb_reserve_back(cb_mask, 1); - auto ptr = reinterpret_cast(get_write_ptr(cb_mask)); - - for (uint32_t h = 0; h < 16; h++) { - // sub tile 0 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) - mask_w_0 = 16; - uint32_t w = 0; - for (; w < mask_w_0; w++) { - ptr[h * 16 + w] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w] = uint16_t(zero.u >> 16); - } - } - - // sub tile 1 - { - uint32_t mask_w_1 = (mask_w < 16) ? 0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - ptr[h * 16 + w + 256] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w + 256] = uint16_t(zero.u >> 16); - } - } - - // sub tile 2 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) - mask_w_0 = 16; - uint32_t w = 0; - for (; w < mask_w_0; w++) { - ptr[h * 16 + w + 512] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w + 512] = uint16_t(zero.u >> 16); - } - } - - // sub tile 3 - { - uint32_t mask_w_1 = (mask_w < 16) ? 0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - ptr[h * 16 + w + 768] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w + 768] = uint16_t(zero.u >> 16); - } - } - } - - cb_push_back(cb_mask, 1); -} - -void generate_mask_h(uint32_t cb_mask, uint32_t mask_h) { - union { - float f; - uint32_t u; - } one; - one.f = 1.0f; - union { - float f; - uint32_t u; - } zero; - zero.f = 0.0f; - - cb_reserve_back(cb_mask, 1); - auto ptr = reinterpret_cast(get_write_ptr(cb_mask)); - - for (uint32_t w = 0; w < 16; w++) { - // sub tile 0 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) - mask_h_0 = 16; - uint32_t h = 0; - for (; h < mask_h_0; h++) { - ptr[h * 16 + w] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w] = uint16_t(zero.u >> 16); - } - } - - // sub tile 1 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) - mask_h_0 = 16; - uint32_t h = 0; - for (; h < mask_h_0; h++) { - ptr[h * 16 + w + 256] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w + 256] = uint16_t(zero.u >> 16); - } - } - - // sub tile 2 - { - uint32_t mask_h_1 = (mask_h < 16) ? 0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - ptr[h * 16 + w + 512] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w + 512] = uint16_t(zero.u >> 16); - } - } - - // sub tile 3 - { - uint32_t mask_h_1 = (mask_h < 16) ? 0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - ptr[h * 16 + w + 768] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w + 768] = uint16_t(zero.u >> 16); - } - } - } - - cb_push_back(cb_mask, 1); -} diff --git a/tt_eager/tt_dnn/op_library/prod/kernels/writer_prod_nc.cpp b/tt_eager/tt_dnn/op_library/prod/kernels/writer_prod_nc.cpp deleted file mode 100644 index 0efd154c76f0..000000000000 --- a/tt_eager/tt_dnn/op_library/prod/kernels/writer_prod_nc.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "dataflow_api.h" - -void kernel_main() { - const auto output_addr = get_arg_val(0); - const auto num_tiles = get_arg_val(1); - const auto start_id = get_arg_val(2); - const auto output_is_dram = (get_arg_val(3) == 1); - - constexpr uint32_t cb_id_out = 16; - constexpr uint32_t onetile = 1; - - uint32_t output_tile_bytes = get_tile_size(cb_id_out); - const auto output_data_format = get_dataformat(cb_id_out); - - const InterleavedAddrGenFast dram_output_addrg = { - .bank_base_address = output_addr, .page_size = output_tile_bytes, .data_format = output_data_format}; - const InterleavedAddrGenFast l1_output_addrg = { - .bank_base_address = output_addr, .page_size = output_tile_bytes, .data_format = output_data_format}; - - for (uint32_t i = start_id; i < start_id + num_tiles; i++) { - uint32_t write_tile_id = i; - cb_wait_front(cb_id_out, onetile); - - uint32_t l1_read_addr = get_read_ptr(cb_id_out); - if (output_is_dram) { - noc_async_write_tile(write_tile_id, dram_output_addrg, l1_read_addr); - } else { - noc_async_write_tile(write_tile_id, l1_output_addrg, l1_read_addr); - } - noc_async_write_barrier(); - cb_pop_front(cb_id_out, onetile); - } -} diff --git a/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp b/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp index e1f52286a5b8..78781395077d 100644 --- a/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp +++ b/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp @@ -27,10 +27,10 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor //////////////////////////////////////////////////////////////////////////// // Parameters Setup //////////////////////////////////////////////////////////////////////////// - const auto cb_data_format = datatype_to_dataformat_converter(output.dtype()); + const auto cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); const auto single_tile_size = detail::TileSize(cb_data_format); - const auto &input_shape = input.shape(); + const auto &input_shape = input.get_legacy_shape(); const auto &input_shape_without_padding = input_shape.without_padding(); const auto N = input_shape[0]; @@ -88,10 +88,18 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp //////////////////////////////////////////////////////////////////////////// - std::vector reader_compile_time_args; - std::vector writer_compile_time_args; - const auto reader_kernel_file = "tt_eager/tt_dnn/op_library/prod/kernels/reader_prod_nc.cpp"; - const auto writer_kernel_file = "tt_eager/tt_dnn/op_library/prod/kernels/writer_prod_nc.cpp"; + + tt_metal::Buffer *input_buffer_type = input.buffer(); + bool input_is_dram = input_buffer_type->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + std::vector reader_compile_time_args = {(std::uint32_t) input_is_dram, static_cast(dim)}; + + tt_metal::Buffer *output_buffer_type = output.buffer(); + constexpr uint32_t cb_id_out = 16; + bool output_is_dram = output_buffer_type->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + std::vector writer_compile_time_args = {(std::uint32_t) cb_id_out, (std::uint32_t) output_is_dram}; + + const auto reader_kernel_file = "tt_eager/tt_dnn/op_library/prod/kernels/dataflow/reader_prod_nc.cpp"; + const auto writer_kernel_file = "tt_eager/tt_dnn/kernels/dataflow/writer_unary_interleaved_start_id.cpp"; const auto reader_kernel_id = CreateReadKernel(program, reader_kernel_file, all_cores, reader_compile_time_args); const auto writer_kernel_id = CreateWriteKernel(program, writer_kernel_file, all_cores, writer_compile_time_args); @@ -100,7 +108,7 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor //////////////////////////////////////////////////////////////////////////// const std::vector compute_args_group_1{num_cols_per_core_group_1}; std::map compute_defines; - const auto compute_kernel_file = "tt_eager/tt_dnn/op_library/prod/kernels/prod_nc.cpp"; + const auto compute_kernel_file = "tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_nc.cpp"; const auto compute_kernel_1_id = CreateComputeKernel( program, compute_kernel_file, {core_group_1, num_cols_per_core_group_1, compute_args_group_1}, compute_defines); diff --git a/tt_eager/tt_dnn/op_library/prod/prod_nc_op.cpp b/tt_eager/tt_dnn/op_library/prod/prod_nc_op.cpp index 6e8c8d679644..fc8712975d06 100644 --- a/tt_eager/tt_dnn/op_library/prod/prod_nc_op.cpp +++ b/tt_eager/tt_dnn/op_library/prod/prod_nc_op.cpp @@ -17,15 +17,15 @@ namespace primary { // Prod //////////////////////////////////////////////////////////////////////////// void Prod::validate(const std::vector& inputs) const { - TT_ASSERT((dim >= 0 && dim <= 3), "dim should be 0 - 3"); + TT_FATAL((dim >= 0 && dim <= 3), "dim should be 0 - 3"); const auto& input = inputs.at(0); const auto& output = inputs.at(1); - auto input_shape = input.shape(); - TT_ASSERT((input_shape.rank()), "rank should be 4"); - const auto& output_shape = output.shape(); - auto input_shape_wo_padding = input.shape().without_padding(); - const auto& output_shape_wo_padding = output.shape().without_padding(); + auto input_shape = input.get_legacy_shape(); + TT_FATAL((input_shape.rank() == 4), "rank should be 4"); + const auto& output_shape = output.get_legacy_shape(); + auto input_shape_wo_padding = input.get_legacy_shape().without_padding(); + const auto& output_shape_wo_padding = output.get_legacy_shape().without_padding(); if (dim == 0 || dim == 1) { input_shape[dim] = 1; @@ -33,10 +33,8 @@ void Prod::validate(const std::vector& inputs) const { } for (int i = 0; i < input_shape.rank(); ++i) { - std::cout<<"input_shape[i] ===> "<< input_shape[i]< "<< output_shape[i]<& dims, - const MemoryConfig& mem_config) { + const MemoryConfig& output_mem_config) { // reduce for all dims if (dims.empty()) { dims = {0, 1, 2, 3}; @@ -110,7 +108,7 @@ Tensor prod_nc( auto temp_input = input; for (uint32_t i = dims.size() - 1; i > 0; i--) { log_debug(LogTest, "{}:{} dim {}", __func__, __LINE__, sorted_dims[i]); - auto temp_output = prod_(temp_input, sorted_dims[i], mem_config); + auto temp_output = prod_(temp_input, sorted_dims[i], output_mem_config); temp_input = temp_output; } log_debug(LogTest, "{}:{} dim {}", __func__, __LINE__, sorted_dims.front()); diff --git a/tt_eager/tt_dnn/op_library/prod/prod_nc_op.hpp b/tt_eager/tt_dnn/op_library/prod/prod_nc_op.hpp index 61fe8353b443..3d879c3071b4 100644 --- a/tt_eager/tt_dnn/op_library/prod/prod_nc_op.hpp +++ b/tt_eager/tt_dnn/op_library/prod/prod_nc_op.hpp @@ -44,7 +44,7 @@ Tensor prod_nc( const Tensor &input, const Tensor &output, std::vector &dims, - const MemoryConfig &mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + const MemoryConfig &output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); } // namespace primary diff --git a/tt_eager/tt_dnn/op_library/prod/prod_op_all.cpp b/tt_eager/tt_dnn/op_library/prod/prod_op_all.cpp new file mode 100644 index 000000000000..385321f1431f --- /dev/null +++ b/tt_eager/tt_dnn/op_library/prod/prod_op_all.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_dnn/op_library/prod/prod_op_all.hpp" +#include "tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp" + +#include +#include + +#include "tt_metal/common/constants.hpp" +#include +#include "tt_metal/host_api.hpp" +#include "tt_metal/tools/profiler/op_profiler.hpp" + +namespace tt { +using namespace constants; +namespace operations { +namespace primary { + +void Prod_op::validate(const std::vector& input_tensors) const { + const auto& input_tensor_a = input_tensors.at(0); + TT_FATAL(input_tensor_a.storage_type() == StorageType::DEVICE, "Operands need to be on device!"); + TT_FATAL(input_tensor_a.buffer() != nullptr , "Operands need to be allocated in buffers on device!"); + TT_FATAL((input_tensor_a.get_layout() == Layout::TILE), "Input Layout must be tilized"); + TT_FATAL(input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED); + TT_FATAL(input_tensor_a.get_dtype() == DataType::BFLOAT16); +} + +std::vector Prod_op::compute_output_shapes(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + return {input_tensor.get_legacy_shape()}; +} + +std::vector Prod_op::create_output_tensors(const std::vector& input_tensors) const { +const auto& input_tensor = input_tensors.at(0); + return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); +} + +operation::ProgramWithCallbacks Prod_op::create_program( + const std::vector& input_tensors, std::vector& output_tensors) const { + const auto& input_tensor_a = input_tensors.at(0); + auto& output_tensor = output_tensors.at(0); + return prod_single_core(input_tensor_a, output_tensor); +} + +Tensor prod_all(const Tensor& input, const MemoryConfig& output_mem_config ) { + Tensor result = tiled_prod( operation::run(Prod_op{.output_mem_config = output_mem_config}, {input}).at(0), output_mem_config); + auto arch_env = detect_arch(); + if(arch_env == tt::ARCH::WORMHOLE_B0){ + return tt::numpy::prod_result_computation_WH_B0(result, result.get_dtype(), result.get_layout(), result.device(), output_mem_config); + } + //else --> GS Arch + return tt::numpy::prod_result_computation_GS(result, result.get_dtype(), result.get_layout(), result.device(), output_mem_config); + return operation::run(Prod_op{.output_mem_config = output_mem_config}, {input}).at(0); +} + +} +} +} diff --git a/tt_eager/tt_dnn/op_library/prod/prod_op_all.hpp b/tt_eager/tt_dnn/op_library/prod/prod_op_all.hpp new file mode 100644 index 000000000000..3fa1c497174f --- /dev/null +++ b/tt_eager/tt_dnn/op_library/prod/prod_op_all.hpp @@ -0,0 +1,45 @@ +/* + * SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once +#include "tensor/tensor.hpp" +#include "tt_dnn/op_library/run_operation.hpp" + +namespace tt { + +namespace operations{ + +namespace primary{ + +/* + * prod product + */ + +struct Prod_op { + const MemoryConfig output_mem_config; + const DataType output_dtype; // TODO: Uplift output_dtype as an option for general dot/bmm + void validate(const std::vector &input_tensors) const; + std::vector compute_output_shapes(const std::vector &input_tensors) const; + std::vector create_output_tensors(const std::vector &input_tensors) const; + operation::ProgramWithCallbacks create_program( + const std::vector &input_tensors, std::vector &output_tensors) const; + static constexpr auto attribute_names = + std::make_tuple("output_mem_config", "output_dtype"); + const auto attribute_values() const { + return std::make_tuple( + std::cref(this->output_mem_config), std::cref(this->output_dtype)); + } +}; + +operation::ProgramWithCallbacks prod_single_core(const Tensor &input_tensor_a, const Tensor &output_tensor); + +Tensor prod_all( + const Tensor &input, + const MemoryConfig &mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); +} + +} +} diff --git a/tt_eager/tt_dnn/op_library/prod/single_core/prod_op_all_single_core.cpp b/tt_eager/tt_dnn/op_library/prod/single_core/prod_op_all_single_core.cpp new file mode 100644 index 000000000000..0fdd6b1e3087 --- /dev/null +++ b/tt_eager/tt_dnn/op_library/prod/single_core/prod_op_all_single_core.cpp @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_dnn/op_library/prod/prod_op_all.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "tt_metal/host_api.hpp" + + +namespace tt { +using namespace constants; +namespace operations { +namespace primary { + + operation::ProgramWithCallbacks prod_single_core(const Tensor &a, const Tensor& output) + { + + Program program{}; + + CoreRange core = {.start={0, 0}, .end={0, 0}}; + + tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + + uint32_t num_tiles = a.volume() / TILE_HW; + + // This should allocate a DRAM buffer on the device + tt_metal::Device *device = a.device(); + + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = 2; + tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); + + tt_metal::CircularBufferConfig cb_inter_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{tt::CB::c_intermed0, cb_data_format}}) + .set_page_size(tt::CB::c_intermed0, single_tile_size); + auto cb_interm = tt_metal::CreateCircularBuffer(program, core, cb_inter_config); + + uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t num_output_tiles = 2; + tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) + .set_page_size(output_cb_index, single_tile_size); + auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); + + auto src_buffer = a.buffer(); + auto dst_buffer = output.buffer(); + + bool src_is_dram = src_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + std::vector reader_compile_time_args = {(uint32_t)src_is_dram}; + bool dst_is_dram = dst_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + std::vector writer_compile_time_args = { + (std::uint32_t) output_cb_index, + (std::uint32_t) dst_is_dram + }; + + tt_metal::KernelHandle unary_reader_kernel_id = tt_metal::CreateKernel( + program, + "tt_eager/tt_dnn/kernels/dataflow/reader_unary_interleaved_start_id.cpp", + core, + tt_metal::ReaderDataMovementConfig{.compile_args=reader_compile_time_args}); + + tt_metal::KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( + program, + "tt_eager/tt_dnn/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + core, + tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args}); + + vector compute_kernel_args = { + num_tiles, // per_core_block_cnt + 1 // per_core_block_size + }; + + bool fp32_dest_acc_en = false; + bool math_approx_mode = true; + auto eltwise_unary_kernel_id = tt_metal::CreateKernel( + program, + "tt_eager/tt_dnn/op_library/prod/kernels/compute/prod_all.cpp", + core, + tt_metal::ComputeConfig{ + .math_fidelity = MathFidelity::HiFi4, + .fp32_dest_acc_en = fp32_dest_acc_en, + .math_approx_mode = math_approx_mode, + .compile_args = compute_kernel_args + } + ); + + SetRuntimeArgs( + program, + unary_reader_kernel_id, + core, + { + src_buffer->address(), + num_tiles, 0 + } + ); + + SetRuntimeArgs( + program, + unary_writer_kernel_id, + core, + { + dst_buffer->address(), + num_tiles, 0 + } + ); + + auto override_runtime_args_callback = [unary_reader_kernel_id, unary_writer_kernel_id]( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_buffer = input_buffers.at(0); + + auto dst_buffer = output_buffers.at(0); + + CoreCoord core = {0, 0}; + + { + auto &runtime_args = GetRuntimeArgs(program, unary_reader_kernel_id, core); + runtime_args[0] = src_buffer->address(); + } + + { + auto &runtime_args = GetRuntimeArgs(program, unary_writer_kernel_id, core); + runtime_args[0] = dst_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +} +} +} diff --git a/tt_eager/tt_lib/csrc/operations/primary/module.hpp b/tt_eager/tt_lib/csrc/operations/primary/module.hpp index fc0c59906f62..1fb5d82606d5 100644 --- a/tt_eager/tt_lib/csrc/operations/primary/module.hpp +++ b/tt_eager/tt_lib/csrc/operations/primary/module.hpp @@ -40,6 +40,7 @@ #include "tt_dnn/op_library/moreh_mean/moreh_mean_op.hpp" #include "tt_dnn/op_library/moreh_mean_backward/moreh_mean_backward_op.hpp" #include "tt_dnn/op_library/moreh_getitem/moreh_getitem_op.hpp" +#include "tt_dnn/op_library/prod/prod_op_all.hpp" namespace py = pybind11; @@ -480,6 +481,14 @@ void py_module(py::module& m_primary) { Performs a rmsnorm(a+b)*gamma + beta operation. )doc"); + //prod along all dimensions + m_primary.def( + "prod_all", + &prod_all, + py::arg("input").noconvert(), + py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + "Computes prod along along dimensions of the tensor."); + // moreh_adam m_primary.def( "moreh_adam", diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_backward_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_backward_ops.cpp index bde7cf041550..3288535cb62c 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_backward_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_backward_ops.cpp @@ -1699,6 +1699,25 @@ namespace tt::tt_metal::detail{ "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" )doc"); + m_tensor.def("prod_bw", &tt::tt_metal::prod_bw, + py::arg("grad").noconvert(), py::arg("input").noconvert(), py::arg("all_dimensions") , py::arg("dim") , py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( + Performs backward operations for prod on ``input_a`` along ``all_dimensions`` or a particular ``dim``. + If ``all_dimensions`` is set to ``true``, irrespective of given dimension it will perform backward prod for all dimensions. + + Input tensor must have BFLOAT16 data type. + + Output tensors will have BFLOAT16 data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "grad", "Gradient tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "input", "Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "all_dimensions", "Consider all dimension (ignores ``dim`` param)", "bool", "", "Yes" + "dim", "Dimension to perform prod", "int", "", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); + m_tensor.def("log_sigmoid_bw", &tt::tt_metal::log_sigmoid_bw, py::arg("grad").noconvert(), py::arg("input").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( Performs backward operations for log_sigmoid ``input`` tensors with given ``grad``. diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp index 6a86f442a114..f16bf681ae84 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp @@ -156,7 +156,6 @@ namespace tt::tt_metal::detail{ detail::bind_unary_op(m_tensor, "swish", swish, R"doc(Returns tensor with the swish all of elements of the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "mish", &mish, R"doc(Returns tensor with the mish activation of elements of the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "cbrt", &cbrt, R"doc(Returns tensor with the cbrt activation of elements of the input tensor ``{0}``.)doc"); - detail::bind_unary_op(m_tensor, "prod", &prod, R"doc(Computes the prod function along all dimensions on the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "asinh", &asinh, R"doc(Returns tensor with the inverse hyperbolic sine of elements of the input tensor ``{0}`` in range [-1e-6, 1e6]. for +input , output = asinh(input) for -input , output = -asinh(input))doc" @@ -208,6 +207,23 @@ namespace tt::tt_metal::detail{ R"doc(Applies the Gated Linear Units (GLU) function to the elements of the input tensor ``{0}`` split along dim ``{1}``.)doc", R"doc(dimension to split)doc" ); + m_tensor.def("prod", &prod, + py::arg("input").noconvert(), py::arg("all_dimensions") = false, py::arg("dim") = 0, py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( + Computes the prod function along specified ``dim`` or all dimensions on the ``input`` tensor. + If ``all_dimensions`` is set to ``true`` irrespective of given dimension it will prod along all dimensions. + + Input tensor must have BFLOAT16 data type. + + Output tensor will have BFLOAT16 data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "input", "Tensor prod is applied to", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "all_dimensions", "Consider all dimension (ignores ``dim`` param)", "bool", "default to false", "No" + "dim", "Dimension to perform prod", "int", "default to 0", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); detail::bind_unary_op_with_param( m_tensor, "geglu", &geglu, py::arg("dim") = -1, diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp index 433d8ca0f589..0ce5217f5423 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp @@ -58,6 +58,7 @@ namespace tt::tt_metal::detail { detail::bind_unary_op(m_tensor, "isfinite", isfinite, R"doc(Returns boolean tensor that is True where input tensor ``{0}``, is finite and False elsewhere.)doc"); detail::bind_unary_op(m_tensor, "isinf", isinf, R"doc(Returns boolean tensor that is True where input tensor ``{0}``, is infinite and False elsewhere.)doc"); detail::bind_unary_op(m_tensor, "isposinf", isposinf, R"doc(Returns each element of input tensor ``{0}``, is positive infinity or not.)doc"); + detail::bind_unary_op(m_tensor, "tiled_prod", tiled_prod, R"doc(Performs tile-wise multiplication on input tensor ``{0}`` and store the result in the last tile of the input tensor.)doc"); detail::bind_unary_op(m_tensor, "isneginf", isneginf, R"doc(Returns each element of input tensor ``{0}``, is negative infinity or not.)doc"); detail::bind_unary_op(m_tensor, "isnan", isnan, R"doc(Returns boolean tensor that is True where tensor ``{0}``, is NaN and False elsewhere.)doc"); detail::bind_unary_op(m_tensor, "sign", sign, R"doc(Returns tensor with the elementwise signum of the input tensor ``{0}``.)doc"); diff --git a/tt_eager/tt_numpy/functions.hpp b/tt_eager/tt_numpy/functions.hpp index 66c30ab4a7f6..3e0dd5fc0679 100644 --- a/tt_eager/tt_numpy/functions.hpp +++ b/tt_eager/tt_numpy/functions.hpp @@ -354,6 +354,120 @@ static Tensor index_all( return output; } +template +static Tensor fill_first_val_into_tensor(const Tensor& input_tensor, DataType data_type, + const Layout layout , Device * device = nullptr, + const MemoryConfig& output_mem_config = MemoryConfig{.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { + const Shape& s_a = input_tensor.get_legacy_shape(); + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(s_a)); //ouput + auto device_buffer = input_tensor.device_buffer(); + uint32_t size_in_bytes = device_buffer->size(); + vector data_vec; + const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { + data_vec.resize(size_in_bytes / sizeof(T)); + tt::tt_metal::tensor_impl::read_data_from_device_buffer(input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true); + } else { + tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); + } + auto input_buffer = owned_buffer::create(std::move(data_vec)); + const Shape input_tensor_strides = input_tensor.strides(); + for(uint32_t i = 0; i < tt_metal::compute_volume(s_a); i++) { + owned_buffer[i] = input_buffer[0]; + } + auto output = Tensor(OwnedStorage{owned_buffer}, s_a, data_type, layout).to(layout); + if (device != nullptr) { + output = output.to(device, output_mem_config); + } + return output; +} + +template +static Tensor prod_result_computation_GS(const Tensor& input_tensor, DataType data_type, + const Layout layout , Device * device = nullptr, + const MemoryConfig& output_mem_config = MemoryConfig{.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { + const Shape& s_a = input_tensor.get_legacy_shape(); + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(s_a)); //ouput + auto device_buffer = input_tensor.device_buffer(); + uint32_t size_in_bytes = device_buffer->size(); + vector data_vec; + const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { + data_vec.resize(size_in_bytes / sizeof(T)); + tt::tt_metal::tensor_impl::read_data_from_device_buffer(input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true); + } else { + tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); + } + auto input_buffer = owned_buffer::create(std::move(data_vec)); + const Shape input_tensor_strides = input_tensor.strides(); + auto result = static_cast(1.0f); + for(uint32_t i = s_a[0]-1; i < s_a[0]; i++) { + for(int32_t j = s_a[1]-1; j < s_a[1]; j++) { + for(int32_t k = s_a[2]-32; k < s_a[2]; k++) { //access last tile + for(int32_t l = s_a[3]-32; l < s_a[3]; l++) { + auto input_index = l + input_tensor_strides[2] * k + input_tensor_strides[1] * j + input_tensor_strides[0] * i; + if(k>=s_a[2]-2 && l>=s_a[3]-32){ //to access 2*32 in TILE layout + result = result * static_cast(input_buffer[input_index]); + owned_buffer[input_index] = static_cast(0.0f); + }else{ + owned_buffer[input_index] = static_cast(0.0f); + } + } + } + } + } + owned_buffer[0] = result; //store the result at the first position of the tensor,and the rest of the values as 0.0f + auto output = Tensor(OwnedStorage{owned_buffer}, s_a, data_type, layout).to(layout); + if (device != nullptr) { + output = output.to(device, output_mem_config); + } + return output; +} + +template +static Tensor prod_result_computation_WH_B0(const Tensor& input_tensor, DataType data_type, + const Layout layout , Device * device = nullptr, + const MemoryConfig& output_mem_config = MemoryConfig{.memory_layout=tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { + const Shape& s_a = input_tensor.get_legacy_shape(); + auto owned_buffer = tt_metal::owned_buffer::create(tt_metal::compute_volume(s_a)); //ouput + auto device_buffer = input_tensor.device_buffer(); + uint32_t size_in_bytes = device_buffer->size(); + vector data_vec; + const char *TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { + data_vec.resize(size_in_bytes / sizeof(T)); + tt::tt_metal::tensor_impl::read_data_from_device_buffer(input_tensor.device()->command_queue(), device_buffer, data_vec.data(), true); + } else { + tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); + } + auto input_buffer = owned_buffer::create(std::move(data_vec)); + const Shape input_tensor_strides = input_tensor.strides(); + auto result = static_cast(1.0f); + // need to access the last 4 rows and alternating columns of index 17 ,19, 21, 23, 25, 27, 29, 31 + for(uint32_t i = s_a[0]-1; i < s_a[0]; i++) { + for(int32_t j = s_a[1]-1; j < s_a[1]; j++) { + for(int32_t k = s_a[2]-32; k < s_a[2]; k++) { //access last tile + for(int32_t l = s_a[3]-32; l < s_a[3]; l++) { + auto input_index = l + input_tensor_strides[2] * k + input_tensor_strides[1] * j + input_tensor_strides[0] * i; + if(k>=s_a[2]-4 && (l==s_a[3]-15 || l==s_a[3]-13 || l==s_a[3]-11 || l==s_a[3]-9 || l==s_a[3]-7 || l==s_a[3]-5 || l==s_a[3]-3 || l==s_a[3]-1)){ //to access 4*16 elements placed alternatively starting from index 17W in TILE layout + result = result * static_cast(input_buffer[input_index]); + owned_buffer[input_index] = static_cast(0.0f); + }else{ + owned_buffer[input_index] = static_cast(0.0f); + } + } + } + } + } + owned_buffer[0] = result; //store the result at the first position of the tensor,and the rest of the values as 0.0f + auto output = Tensor(OwnedStorage{owned_buffer}, s_a, data_type, layout).to(layout); + if (device != nullptr) { + output = output.to(device, output_mem_config); + } + return output; +} + + template static Tensor index_channel( const Shape& shape, diff --git a/tt_metal/common/bfloat16.hpp b/tt_metal/common/bfloat16.hpp index f6dde378b1c8..a6c58ef16256 100644 --- a/tt_metal/common/bfloat16.hpp +++ b/tt_metal/common/bfloat16.hpp @@ -70,6 +70,9 @@ class bfloat16 { bool operator!=(const bfloat16 rhs) const { return not (*this == rhs); } + bfloat16 operator*(const bfloat16 rhs) const { + return bfloat16(this->to_float() * rhs.to_float()); + } }; inline ostream& operator<<(ostream& os, const bfloat16& bfp16) diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h index 8162c89d5e72..501b7a307a23 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_unary_sfpu_api.h @@ -11,6 +11,7 @@ #include "llk_math_eltwise_unary_sfpu_heaviside.h" #include "llk_math_eltwise_unary_sfpu_power.h" #include "llk_math_eltwise_unary_sfpu_rsqrt.h" +#include "llk_math_eltwise_unary_sfpu_tiled_prod.h" #include "llk_math_eltwise_unary_sfpu_sigmoid.h" #include "llk_math_eltwise_unary_sfpu_sign.h" #include "llk_math_eltwise_unary_sfpu_signbit.h" diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h new file mode 100644 index 000000000000..de91583c898b --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_tiled_prod() +{ + vFloat result = 1.0f; + #pragma GCC unroll 0 + for (int d = 0; d < ITERATIONS; d++) { + vFloat v = dst_reg[0]; + result *= v; + dst_reg[0] = result; + dst_reg++; + } + vFloat v = dst_reg[0]; + result *= v; + dst_reg[0] = result; + dst_reg++; +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h new file mode 100644 index 000000000000..929a71a8815c --- /dev/null +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_0_param.h" +#include "ckernel_sfpu_tiled_prod.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_tiled_prod_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_tiled_prod(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_0_param + (ckernel::sfpu::calculate_tiled_prod, + ckernel::sfpu::calculate_tiled_prod, + dst_index, vector_mode); +} + +} diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h index bf23a084b6df..5e7a3526ae2c 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu_types.h @@ -60,5 +60,6 @@ enum SfpuType { silu, mask, negative, + tiled_prod, unused, }; diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index 23972fb69f9e..74f6d1a0a310 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -14,6 +14,7 @@ #include "llk_math_eltwise_unary_sfpu_max.h" #include "llk_math_eltwise_unary_sfpu_power.h" #include "llk_math_eltwise_unary_sfpu_rsqrt.h" +#include "llk_math_eltwise_unary_sfpu_tiled_prod.h" #include "llk_math_eltwise_unary_sfpu_sigmoid.h" #include "llk_math_eltwise_unary_sfpu_sign.h" #include "llk_math_eltwise_unary_sfpu_signbit.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h new file mode 100644 index 000000000000..2e1fb1201c5e --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_tiled_prod.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "noc_nonblocking_api.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_tiled_prod() +{ + vFloat result = 1.0f; + #pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { + vFloat v = dst_reg[0]; + result *= v; + dst_reg[0] = result; + dst_reg++; + } + vFloat v = dst_reg[0]; + result *= v; + dst_reg[0] = result; + dst_reg++; +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h new file mode 100644 index 000000000000..3891d688bdaa --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_tiled_prod.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_0_param.h" +#include "ckernel_sfpu_tiled_prod.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_tiled_prod_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_tiled_prod(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_0_param + (ckernel::sfpu::calculate_tiled_prod, + ckernel::sfpu::calculate_tiled_prod, + dst_index, vector_mode); +} + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h index e2105c90b676..83f94fa716df 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h @@ -70,5 +70,6 @@ enum SfpuType { topk_local_sort, topk_merge, topk_rebuild, + tiled_prod, unused, }; diff --git a/tt_metal/include/compute_kernel_api.h b/tt_metal/include/compute_kernel_api.h index b1ea83c175cc..d2d3196d0847 100644 --- a/tt_metal/include/compute_kernel_api.h +++ b/tt_metal/include/compute_kernel_api.h @@ -357,6 +357,30 @@ ALWI void lez_tile_init() { MATH(( llk_math_eltwise_unary_sfpu_lez_init() )); } +/** + * Performs element-wise multiplication on each row of a tile. + * The DST register buffer must be in + * acquired state via *acquire_dst* call. This call is blocking and is only + * available on the compute engine. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |-----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst | The index of the tile in DST register buffer to perform the computation on | uint32_t | Must be less than the size of the DST register buffer | True | + */ +ALWI void tiled_prod_tile(uint32_t idst) { + MATH(( llk_math_eltwise_unary_sfpu_tiled_prod(idst) )); +} + +/** + * Please refer to documentation for any_init. + */ +ALWI void tiled_prod_tile_init() { + MATH(( llk_math_eltwise_unary_sfpu_tiled_prod_init() )); +} + + /** * Will store in the output of the compute core True if each element is greater than zero. * The DST register buffer must be in acquired state via *acquire_dst* call.