diff --git a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst index 00b705f74dc..ab2cacd3c24 100644 --- a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst +++ b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst @@ -274,6 +274,10 @@ Tensor elementwise operations .. autofunction:: tt_lib.tensor.div +.. autofunction:: tt_lib.tensor.div_trunc + +.. autofunction:: tt_lib.tensor.unary_rdiv_trunc + .. autofunction:: tt_lib.tensor.div_no_nan .. autofunction:: tt_lib.tensor.add_unary @@ -454,12 +458,18 @@ Tensor elementwise operations .. autofunction:: tt_lib.tensor.floor +.. autofunction:: tt_lib.tensor.ceil + .. autofunction:: tt_lib.tensor.trunc +.. autofunction:: tt_lib.tensor.frac + .. autofunction:: tt_lib.tensor.round .. autofunction:: tt_lib.tensor.floor_div +.. autofunction:: tt_lib.tensor.rfloor_div + Tensor relational operations ============================ .. autofunction:: tt_lib.tensor.gtz diff --git a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py index 8f53354af61..7797eacc0de 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py @@ -268,6 +268,18 @@ "tt_op": tt_lib_ops.eltwise_div, "pytorch_op": pytorch_ops.div, }, + "eltwise-div_trunc": { + "tt_op": tt_lib_ops.eltwise_div_trunc, + "pytorch_op": pytorch_ops.div_trunc, + }, + "eltwise-unary_div_trunc": { + "tt_op": tt_lib_ops.eltwise_unary_div_trunc, + "pytorch_op": pytorch_ops.unary_div_trunc, + }, + "eltwise-unary_rdiv_trunc": { + "tt_op": tt_lib_ops.eltwise_unary_rdiv_trunc, + "pytorch_op": pytorch_ops.unary_rdiv_trunc, + }, "eltwise-div_no_nan": { "tt_op": tt_lib_ops.eltwise_div_no_nan, "pytorch_op": pytorch_ops.div_no_nan, @@ -620,10 +632,18 @@ "tt_op": tt_lib_ops.eltwise_floor, "pytorch_op": pytorch_ops.floor, }, + "eltwise-ceil": { + "tt_op": tt_lib_ops.eltwise_ceil, + "pytorch_op": pytorch_ops.ceil, + }, "eltwise-trunc": 
{ "tt_op": tt_lib_ops.eltwise_trunc, "pytorch_op": pytorch_ops.trunc, }, + "eltwise-frac": { + "tt_op": tt_lib_ops.eltwise_frac, + "pytorch_op": pytorch_ops.frac, + }, "eltwise-floor_div": { "tt_op": tt_lib_ops.eltwise_floor_div, "pytorch_op": pytorch_ops.floor_div, @@ -632,6 +652,10 @@ "tt_op": tt_lib_ops.eltwise_unary_floor_div, "pytorch_op": pytorch_ops.unary_floor_div, }, + "eltwise-rfloor_div": { + "tt_op": tt_lib_ops.eltwise_rfloor_div, + "pytorch_op": pytorch_ops.rfloor_div, + }, "eltwise-round": { "tt_op": tt_lib_ops.eltwise_round, "pytorch_op": pytorch_ops.round, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py new file mode 100644 index 00000000000..21d8f3609c0 --- /dev/null +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +import random +from functools import partial +import tt_lib as ttl + + +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) +from models.utility_functions import skip_for_grayskull + +mem_configs = [ + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM), + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1), +] + + +@pytest.mark.parametrize( + "input_shapes", + [ + [[1, 1, 32, 32], [1, 1, 32, 32]], + [[1, 1, 320, 384], [1, 1, 320, 384]], + [[1, 3, 320, 384], [1, 3, 320, 384]], + ], +) +@pytest.mark.parametrize( + "dst_mem_config", + mem_configs, +) +@skip_for_grayskull("#ToDo: GS implementation needs to be done for floor") +class TestDivTrunc: + def test_run_div_trunc( + self, + input_shapes, + 
dst_mem_config, + device, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) + ] + [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + + run_single_pytorch_test( + "eltwise-div_trunc", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py index 6b6d6e6a4aa..9bdf8d5e182 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py @@ -585,7 +585,7 @@ def test_run_eltwise_sign_ops( test_args, ) - @pytest.mark.parametrize("round_off_method", ["floor", "trunc"]) + @pytest.mark.parametrize("round_off_method", ["floor", "ceil", "trunc"]) @skip_for_grayskull("#ToDo: GS implementation needs to be done for Floor") def test_run_eltwise_round_off_ops( self, @@ -597,9 +597,7 @@ def test_run_eltwise_round_off_ops( output_mem_config, ): datagen_func = [ - generation_funcs.gen_func_with_cast( - partial(generation_funcs.gen_rand, low=-1000, high=1000), torch.bfloat16 - ) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] test_args.update( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py new file mode 100644 index 00000000000..083fabc783f --- /dev/null +++ 
b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch +import random +from functools import partial +import tt_lib as ttl +from models.utility_functions import skip_for_grayskull + +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) + +mem_configs = [ + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM), + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1), +] + + +@pytest.mark.parametrize( + "input_shapes", + [ + [[1, 1, 32, 32]], + [[1, 1, 320, 384]], + [[1, 3, 320, 384]], + ], +) +@pytest.mark.parametrize( + "dst_mem_config", + mem_configs, +) +@skip_for_grayskull("#ToDo: GS implementation needs to be done") +class TestFrac: + def test_run_frac( + self, + input_shapes, + dst_mem_config, + device, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + + run_single_pytorch_test( + "eltwise-frac", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py new file mode 100644 index 00000000000..cb05219d1fb --- /dev/null +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +import random +import numpy as np +from functools import partial +import tt_lib as ttl +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) +from models.utility_functions import skip_for_grayskull + +mem_configs = [ + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM), + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1), +] + + +@pytest.mark.parametrize( + "input_shapes", + [ + [[1, 1, 32, 32], [1, 1, 32, 32]], + [[1, 1, 320, 384], [1, 1, 320, 384]], + [[1, 3, 320, 384], [1, 3, 320, 384]], + ], +) +@pytest.mark.parametrize( + "dst_mem_config", + mem_configs, +) +@skip_for_grayskull("#ToDo: GS implementation needs to be done for floor") +class TestRfloor_div: + def test_run_rfloor_div( + self, + input_shapes, + dst_mem_config, + device, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update({"value": random.uniform(-100, 100) for _ in range(5)}) + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + + run_single_pytorch_test( + "eltwise-rfloor_div", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py new file mode 100644 index 00000000000..f93da898418 --- /dev/null +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: © 2023-24 
Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +import random +import numpy as np +from functools import partial +import tt_lib as ttl +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) +from models.utility_functions import skip_for_grayskull + +mem_configs = [ + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM), + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1), +] + + +@pytest.mark.parametrize( + "input_shapes", + [ + [[1, 1, 32, 32], [1, 1, 32, 32]], + [[1, 1, 320, 384], [1, 1, 320, 384]], + [[1, 3, 320, 384], [1, 3, 320, 384]], + ], +) +@pytest.mark.parametrize( + "dst_mem_config", + mem_configs, +) +@skip_for_grayskull("#ToDo: GS implementation needs to be done for floor") +class TestUnary_Div_Trunc: + def test_run_unary_div_trunc( + self, + input_shapes, + dst_mem_config, + device, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update({"value": random.uniform(-100, 100) for _ in range(5)}) + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + + run_single_pytorch_test( + "eltwise-unary_div_trunc", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py new file mode 100644 index 00000000000..d985c4d90c7 --- /dev/null +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py @@ -0,0 +1,60 @@ +# 
SPDX-FileCopyrightText: © 2023-24 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +import random +import numpy as np +from functools import partial +import tt_lib as ttl +from tests.tt_eager.python_api_testing.sweep_tests import ( + comparison_funcs, + generation_funcs, +) +from tests.tt_eager.python_api_testing.sweep_tests.run_pytorch_ci_tests import ( + run_single_pytorch_test, +) +from models.utility_functions import skip_for_grayskull + +mem_configs = [ + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.DRAM), + ttl.tensor.MemoryConfig(ttl.tensor.TensorMemoryLayout.INTERLEAVED, ttl.tensor.BufferType.L1), +] + + +@pytest.mark.parametrize( + "input_shapes", + [ + [[1, 1, 32, 32], [1, 1, 32, 32]], + [[1, 1, 320, 384], [1, 1, 320, 384]], + [[1, 3, 320, 384], [1, 3, 320, 384]], + ], +) +@pytest.mark.parametrize( + "dst_mem_config", + mem_configs, +) +@skip_for_grayskull("#ToDo: GS implementation needs to be done for floor") +class TestUnary_Rdiv_Trunc: + def test_run_unary_rdiv_trunc( + self, + input_shapes, + dst_mem_config, + device, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args.update({"value": random.uniform(-100, 100) for _ in range(5)}) + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + + run_single_pytorch_test( + "eltwise-unary_rdiv_trunc", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index 341315808a7..80b51c459c7 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -651,10 +651,19 @@ def 
floor(x, *args, **kwargs): return torch.floor(x) +def ceil(x, *args, **kwargs): + return torch.ceil(x) + + def trunc(x, *args, **kwargs): return torch.trunc(x) +def frac(x, *args, **kwargs): + result = torch.frac(x) + return result + + def floor_div(x, y, *args, **kwargs): result = torch.floor_divide(x, y) return result @@ -666,6 +675,12 @@ def unary_floor_div(x, *args, **kwargs): return result +def rfloor_div(x, *args, **kwargs): + value = kwargs.pop("value") + result = torch.floor_divide(value, x) + return result + + def round(x, *args, **kwargs): decimals = kwargs.pop("decimals") result = torch.round(x, decimals=decimals) @@ -753,6 +768,23 @@ def div(x, y, *args, accurate_mode, round_mode, **kwargs): return torch.div(x, y, rounding_mode=round_mode) +def div_trunc(x, y, *args, **kwargs): + result = torch.div(x, y, rounding_mode="trunc") + return result + + +def unary_div_trunc(x, *args, **kwargs): + value = kwargs.pop("value") + result = torch.div(x, value, rounding_mode="trunc") + return result + + +def unary_rdiv_trunc(x, *args, **kwargs): + value = kwargs.pop("value") + result = torch.trunc(value / x) + return result + + def div_no_nan(x, y, *args, **kwargs): result = torch.where(y == 0, 0, x / y) return result diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index 58748607f4b..c420e41e9d5 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -1114,6 +1114,24 @@ def eltwise_unary_floor_div( return tt2torch_tensor(t1) +@setup_host_and_device +def eltwise_rfloor_div( + x, + *args, + value, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = ttl.tensor.rfloor_div(value, t0, output_mem_config=output_mem_config) + + return tt2torch_tensor(t1) + + @setup_host_and_device def 
eltwise_round( x, @@ -1169,6 +1187,42 @@ def eltwise_unary_div_no_nan( return tt2torch_tensor(t1) +@setup_host_and_device +def eltwise_unary_div_trunc( + x, + *args, + value, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = ttl.tensor.div_trunc(t0, value, output_mem_config=output_mem_config) + + return tt2torch_tensor(t1) + + +@setup_host_and_device +def eltwise_unary_rdiv_trunc( + x, + *args, + value, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = ttl.tensor.unary_rdiv_trunc(value, t0, output_mem_config=output_mem_config) + + return tt2torch_tensor(t1) + + @setup_host_and_device def lamb_optimizer( x, @@ -2591,7 +2645,9 @@ def unary_op( transpose_nw = make_unary_op(partial(ttl.tensor.transpose, dim0=0, dim1=-1)) transpose_cw = make_unary_op(partial(ttl.tensor.transpose, dim0=1, dim1=-1)) eltwise_floor = make_unary_op(ttl.tensor.floor) +eltwise_ceil = make_unary_op(ttl.tensor.ceil) eltwise_trunc = make_unary_op(ttl.tensor.trunc) +eltwise_frac = make_unary_op(ttl.tensor.frac) @setup_host_and_device @@ -2659,6 +2715,7 @@ def binary_op( eltwise_add = make_binary_op_ttnn(ttnn.add) eltwise_sub = make_binary_op_ttnn(ttnn.sub) eltwise_mul = make_binary_op_ttnn(ttnn.mul) +eltwise_div_trunc = make_binary_op(ttl.tensor.div_trunc) eltwise_squared_difference = make_binary_op_ttnn(ttnn.squared_difference) eltwise_hypot = make_binary_op_ttnn(ttnn.hypot) eltwise_atan2 = make_binary_op_ttnn(ttnn.atan2) diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index c5b268cd615..bf6e79fd44a 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -994,6 +994,65 @@ Tensor trunc(const Tensor& input, const 
MemoryConfig& output_mem_config) { return operation::decorate_as_composite(__func__, _trunc)(input, output_mem_config); } +Tensor _frac(const Tensor& input, const MemoryConfig& output_mem_config) { + auto arch = input.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + Tensor trunc_res = trunc(input, output_mem_config); + Tensor result = ttnn::subtract(input, trunc_res, std::nullopt, output_mem_config); + return result; +} +Tensor frac(const Tensor& input, const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _frac)(input, output_mem_config); +} + +Tensor _div_trunc( + const Tensor& input_a, + const Tensor& input_b, + const MemoryConfig& output_mem_config) { + auto arch = input_a.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + Tensor result = div(input_a, input_b, true); + return trunc(result); +} +Tensor div_trunc( + const Tensor& input_a, + const Tensor& input_b, + const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _div_trunc)(input_a, input_b, output_mem_config); +} + +Tensor _div_trunc_overload( + const Tensor& input, + float value, + const MemoryConfig& output_mem_config) { + auto arch = input.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + Tensor result = div_unary(input, value); + return trunc(result); +} +Tensor div_trunc( + const Tensor& input, + float value, + const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _div_trunc_overload)(input, value, output_mem_config); +} + +Tensor _unary_rdiv_trunc( + float value, + const Tensor& input, + const MemoryConfig& output_mem_config) { + auto arch = input.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); + Tensor result = div_unary(value, input); + return trunc(result); +} +Tensor unary_rdiv_trunc( + 
float value, + const Tensor& input, + const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _unary_rdiv_trunc)(value, input, output_mem_config); +} + Tensor is_odd(const Tensor& input, const MemoryConfig& output_mem_config) { Tensor result = ttnn::multiply(input, (1.0f/2.0f)); Tensor floor_res = ttnn::floor(result); @@ -1068,6 +1127,14 @@ Tensor floor_div(const Tensor& input_a, float value, const MemoryConfig& output_ return operation::decorate_as_composite(__func__, _floor_div_overload)(input_a, value, output_mem_config); } +Tensor _rfloor_div(float value, const Tensor& input, const MemoryConfig& output_mem_config) { + Tensor result = div_unary(value, input); + return floor(result, output_mem_config); +} +Tensor rfloor_div(float value, const Tensor& input, const MemoryConfig& output_mem_config) { + return operation::decorate_as_composite(__func__, _rfloor_div)(value, input, output_mem_config); +} + Tensor _div_no_nan(const Tensor& input_a, const Tensor& input_b, const MemoryConfig& output_mem_config) { Tensor div_result = div(input_a, input_b); return where(ttnn::eqz(input_b, output_mem_config), 0, div_result); diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp index 87fef28d06c..4436a076192 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp @@ -155,6 +155,21 @@ Tensor div( string round_mode = "None", const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); +Tensor div_trunc( + const Tensor& input_a, + const Tensor& input_b, + const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + +Tensor div_trunc( + const Tensor& input, + float value, + const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + +Tensor unary_rdiv_trunc( + float value, + const Tensor& input, + const MemoryConfig& output_mem_config 
= operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + Tensor div_no_nan( const Tensor& input_a, const Tensor& input_b, @@ -177,6 +192,10 @@ Tensor fmod( Tensor trunc(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); +Tensor frac( + const Tensor& input, + const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + Tensor round( const Tensor& input, int64_t decimals = 0, @@ -192,6 +211,11 @@ Tensor floor_div( float value, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); +Tensor rfloor_div( + float value, + const Tensor& input, + const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + // xlogy(x,y))=x*log(y) Tensor xlogy( const Tensor& input_a, diff --git a/tt_eager/tt_dnn/op_library/conv/conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/conv_op.cpp index d581a825630..3621c1db75c 100644 --- a/tt_eager/tt_dnn/op_library/conv/conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/conv_op.cpp @@ -34,10 +34,10 @@ pair, vector> compute_conv_activation_as_mm_shape(Sha // pad height uint32_t num_rows = (uint32_t) conv_output_h*conv_output_w; uint32_t act_block_h_datums = act_block_h_ntiles * TILE_HEIGHT; - uint32_t num_rows_padded = (uint32_t) (ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); + uint32_t num_rows_padded = (uint32_t) (std::ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); uint32_t num_cols = conv_activation_shape[3] * filter_h * filter_w; uint32_t act_block_w_datums = act_block_w_ntiles * TILE_WIDTH; - uint32_t num_cols_padded = (uint32_t) (ceil((double) num_cols / (double) act_block_w_datums ) * act_block_w_datums); + uint32_t num_cols_padded = (uint32_t) (std::ceil((double) num_cols / (double) act_block_w_datums ) * act_block_w_datums); if(use_fast_reader) { assert(act_block_w_datums >= conv_activation_shape[3] * filter_w); num_cols_padded = act_block_w_datums * filter_h; @@ -218,7 
+218,7 @@ operation::ProgramWithCallbacks conv_as_large_bmm_single_core_(const Tensor& a, uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); uint32_t output_row_size_bytes = output_channels_padded_to_tile_width * num_bytes_of_df; @@ -726,7 +726,7 @@ std::pair, vector> generate_conv_weight_address_map( address_map_metadata.push_back(address_map_current_group_dram_address_offset); address_map_metadata.push_back(address_map_current_group_size); // Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements) - uint32_t address_map_current_group_size_padded = (uint32_t) (ceil((double) address_map_current_group_size / (double) 8) * 8); + uint32_t address_map_current_group_size_padded = (uint32_t) (std::ceil((double) address_map_current_group_size / (double) 8) * 8); if(address_map_current_group_size_padded != address_map_current_group_size) { assert(address_map_current_group_size_padded > address_map_current_group_size); address_map.insert(address_map.end(), address_map_current_group_size_padded - address_map_current_group_size, 0); @@ -764,8 +764,8 @@ std::pair, vector> generate_conv_activation_address_m int conv_output_w = ((conv_input_y - S + (2 * Pad_W)) / V) + 1; uint32_t matrix_height_unpadded = conv_output_h * conv_output_w; 
uint32_t matrix_width_unpadded = conv_input_z * R * S; - uint32_t matrix_height = (uint32_t) (ceil((double) matrix_height_unpadded / (double) act_block_h_datums ) * act_block_h_datums); - uint32_t matrix_width = (uint32_t) (ceil((double) matrix_width_unpadded / (double) act_block_w_datums ) * act_block_w_datums); + uint32_t matrix_height = (uint32_t) (std::ceil((double) matrix_height_unpadded / (double) act_block_h_datums ) * act_block_h_datums); + uint32_t matrix_width = (uint32_t) (std::ceil((double) matrix_width_unpadded / (double) act_block_w_datums ) * act_block_w_datums); uint32_t num_groups = num_blocks_act_h * num_blocks_act_w * num_blocks_weight_w; uint32_t channel_stick_size = conv_input_z; @@ -854,7 +854,7 @@ std::pair, vector> generate_conv_activation_address_m address_map_metadata.push_back(address_map_current_group_dram_address_offset); address_map_metadata.push_back(address_map_current_group_size); // Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements) - uint32_t address_map_current_group_size_padded = (uint32_t) (ceil((double) address_map_current_group_size / (double) 8) * 8); + uint32_t address_map_current_group_size_padded = (uint32_t) (std::ceil((double) address_map_current_group_size / (double) 8) * 8); if(address_map_current_group_size_padded != address_map_current_group_size) { assert(address_map_current_group_size_padded > address_map_current_group_size); address_map.insert(address_map.end(), address_map_current_group_size_padded - address_map_current_group_size, 0); @@ -903,7 +903,7 @@ std::pair, vector> populate_address_map_vectors_for_r address_map_raw_current_group_start + current_group_size); address_map_raw_index += current_group_size; // Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements) - uint32_t current_group_size_padded = (uint32_t) (ceil((double) current_group_size / (double) 8) * 8); + uint32_t 
current_group_size_padded = (uint32_t) (std::ceil((double) current_group_size / (double) 8) * 8); if(current_group_size_padded != current_group_size) { assert(current_group_size_padded > current_group_size); address_map.insert(address_map.end(), current_group_size_padded - current_group_size, 0); @@ -988,7 +988,7 @@ operation::ProgramWithCallbacks conv_as_large_bmm_with_address_map_single_core_( // it removes the padding done for block width but it doesn't remove padding done for tiled width uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= Wb); - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? 
weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); uint32_t output_row_size_bytes = output_channels_padded_to_tile_width * num_bytes_of_df; diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp index 6ca3178702f..5a0a8a9814f 100644 --- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp @@ -262,7 +262,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_(const Tensor& a, cons uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? 
weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp index 2e05661b491..f46cdca2a1b 100644 --- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp +++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp @@ -278,7 +278,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_(const Tensor& uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? 
weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp index 97fe8aea401..a352d53ba7c 100644 --- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp +++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp @@ -442,7 +442,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; uint32_t num_blocks_output_w = - (uint32_t)ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums); + (uint32_t)std::ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? 
weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); diff --git a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp index 725fcba2499..1f363d4c750 100644 --- a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp @@ -40,7 +40,7 @@ pair, vector> compute_opt_conv_activation_as_mm_shape // pad height uint32_t num_rows = (uint32_t) batch_size * conv_output_h * conv_output_w; uint32_t act_block_h_datums = act_block_h_ntiles * TILE_HEIGHT; - uint32_t num_rows_padded = (uint32_t) (ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); + uint32_t num_rows_padded = (uint32_t) (std::ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); uint32_t num_cols = conv_activation_shape[3] * filter_h * filter_w; uint32_t num_cols_padded = round_up(conv_activation_shape[3] * filter_w, TILE_WIDTH) * filter_h; return {{1, num_rows_padded, num_cols_padded}, {1, num_rows, num_cols}}; diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp index b5bab4f7f01..2f2592d6f96 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp @@ -74,6 +74,7 @@ void update_macro_defines(UnaryOpType op_type, std::map get_op_init_and_func_default(UnaryOpType op_type, stri op_init_and_name = {"signbit_tile_init();", fmt::format("signbit_tile({});", idst)}; break; case UnaryOpType::FLOOR: op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile({});", idst)}; break; + case UnaryOpType::CEIL: op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile({});", idst)}; break; case UnaryOpType::SIN: op_init_and_name = {"sin_tile_init();", fmt::format("sin_tile({});", idst)}; break; case UnaryOpType::COS: op_init_and_name = 
{"cos_tile_init();", fmt::format("cos_tile({});", idst)}; break; case UnaryOpType::ISFINITE: @@ -371,6 +373,7 @@ inline void validate_supported_arch_dtype(tt::ARCH arch, DataType input_datatype switch (op_type) { case UnaryOpType::REMAINDER: case UnaryOpType::FLOOR: + case UnaryOpType::CEIL: case UnaryOpType::LEFT_SHIFT: case UnaryOpType::RIGHT_SHIFT: TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp index 6226b08ffa9..8369a6ce5a8 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp @@ -87,7 +87,8 @@ enum class UnaryOpType { FLOOR, LEFT_SHIFT, REMAINDER, - FMOD + FMOD, + CEIL }; template @@ -582,6 +583,7 @@ constexpr auto isneginf = make_eltwise_unary{}; constexpr auto isnan = make_eltwise_unary{}; constexpr auto signbit = make_eltwise_unary{}; constexpr auto floor = make_eltwise_unary{}; +constexpr auto ceil = make_eltwise_unary{}; constexpr auto atan = make_eltwise_unary{}; constexpr auto nez = make_eltwise_unary{}; constexpr auto logical_not_unary = make_eltwise_unary{}; diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp index 15b5f6a2533..a176a863cf5 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp @@ -39,8 +39,8 @@ operation::ProgramWithCallbacks eltwise_unary_sharded(const Tensor &input, Tenso uint32_t num_tile_per_core = 0; if (input.get_dtype() == DataType::BFLOAT8_B) { - uint32_t ntiles_along_width = ceil(shard_spec.shape[1] / (float) constants::TILE_WIDTH); - uint32_t ntiles_along_height = ceil(shard_spec.shape[0] / (float) constants::TILE_HEIGHT); + 
uint32_t ntiles_along_width = std::ceil(shard_spec.shape[1] / (float) constants::TILE_WIDTH); + uint32_t ntiles_along_height = std::ceil(shard_spec.shape[0] / (float) constants::TILE_HEIGHT); num_tile_per_core = ntiles_along_width * ntiles_along_height; } else { TT_FATAL((shard_spec.shape[1] * datum_size(act_df)) % L1_ALIGNMENT == 0, "Shard width should be multiple of L1_ADRESS_ALIGNMENT"); diff --git a/tt_eager/tt_dnn/op_library/pool/max_pool.hpp b/tt_eager/tt_dnn/op_library/pool/max_pool.hpp index 0fb889f6075..ef4cc32d759 100644 --- a/tt_eager/tt_dnn/op_library/pool/max_pool.hpp +++ b/tt_eager/tt_dnn/op_library/pool/max_pool.hpp @@ -9,7 +9,7 @@ #include "tt_dnn/op_library/sliding_window_op_infra/sliding_window.hpp" inline uint32_t ceil_multiple_of(uint32_t n, uint32_t m) { - return (uint32_t) ceil((float) n / m) * m; + return (uint32_t) std::ceil((float) n / m) * m; } namespace tt { diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp index a6174885043..282274d040e 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_composite_ops.cpp @@ -1184,13 +1184,56 @@ void TensorModuleCompositeOPs(py::module& m_tensor) { "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" )doc"); - m_tensor.def( - "div_no_nan", - py::overload_cast(&div_no_nan), - py::arg("input_a").noconvert(), - py::arg("input_b").noconvert(), - py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, - R"doc( + m_tensor.def("div_trunc", py::overload_cast(&div_trunc), + py::arg("input_a").noconvert(), py::arg("input_b").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( + Performs the element-wise division of tensors ``input_a`` by ``input_b`` and rounds the result using trunc mode. 
Support provided only for Wormhole_B0. + + Input tensor must have BFLOAT16 data type. + + Output tensor will have BFLOAT16 data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "input_a", "Numerator Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "input_b", "Denominator Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); + + m_tensor.def("div_trunc", py::overload_cast(&div_trunc), + py::arg("input").noconvert(), py::arg("value").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( + Performs the element-wise division of a tensor ``input`` by scalar ``value`` and rounds the result using trunc mode. Support provided only for Wormhole_B0. + + Input tensor must have BFLOAT16 data type. + + Output tensor will have BFLOAT16 data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "input", "Numerator Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "value", "Denominator value", "float", "", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); + + m_tensor.def("unary_rdiv_trunc", py::overload_cast(&unary_rdiv_trunc), + py::arg("value").noconvert(), py::arg("input").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( + Performs the element-wise division of a scalar ``value`` by a tensor ``input`` and rounds the result using trunc mode. Support provided only for Wormhole_B0. + + Input tensor must have BFLOAT16 data type. + + Output tensor will have BFLOAT16 data type. + + .. 
csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "value", "Numerator value", "float", "", "Yes" + "input", "Denominator Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); + + m_tensor.def("div_no_nan", py::overload_cast(&div_no_nan), + py::arg("input_a").noconvert(), py::arg("input_b").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( Performs the element-wise div_no_nan on two tensors ``input_a`` and ``input_b``, which returns 0 if ``input_b`` (denominator) is zero. Input tensor must have BFLOAT16 data type. @@ -1273,6 +1316,21 @@ void TensorModuleCompositeOPs(py::module& m_tensor) { "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" )doc"); + m_tensor.def("frac",&frac, + py::arg("input").noconvert(),py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,R"doc( + Performs the element-wise frac operation on ``input``. Support provided only for Wormhole_B0. + + Input tensor must have BFLOAT16 data type. + + Output tensor will have BFLOAT16 data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "input", "Input Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); + m_tensor.def("round",&round, py::arg("input").noconvert(),py::arg("decimals"),py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,R"doc( Performs the element-wise round operation on ``input`` , to the given number of ``decimals`` places. Support provided only for Wormhole_B0 and ``decimals = 0``. 
@@ -1321,6 +1379,22 @@ void TensorModuleCompositeOPs(py::module& m_tensor) { "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" )doc"); + m_tensor.def("rfloor_div", py::overload_cast(&rfloor_div), + py::arg("value").noconvert(), py::arg("input").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG,R"doc( + Performs the element-wise floor division of a scalar ``value`` by a tensor ``input``. Support provided only for Wormhole_B0. + + Input tensor must have BFLOAT16 data type. + + Output tensor will have BFLOAT16 data type. + + .. csv-table:: + :header: "Argument", "Description", "Data type", "Valid range", "Required" + + "value", "Numerator value", "float", "", "Yes" + "input", "Denominator Tensor", "Tensor", "Tensor of shape [W, Z, Y, X]", "Yes" + "output_mem_config", "Layout of tensor in TT Accelerator device memory banks", "MemoryConfig", "Default is interleaved in DRAM", "No" + )doc"); + m_tensor.def("mac", py::overload_cast(&mac), py::arg("input").noconvert(), py::arg("tensor1").noconvert(), py::arg("tensor2").noconvert(), py::arg("output_mem_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, R"doc( Returns tensor with the multiply and accumulation of all of elements of the input tensors ``input, tensor1, tensor2``. diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp index 8eb51c3bd9c..f0c3a1e1918 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp @@ -43,6 +43,7 @@ namespace tt::tt_metal::detail { ); detail::bind_unary_op(m_tensor, "signbit", signbit, R"doc(Applies the signbit function to the elements of the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "floor", floor, R"doc(Applies floor to the elements of the input tensor ``{0}``. 
Support provided only for Wormhole_B0.)doc"); + detail::bind_unary_op(m_tensor, "ceil", ceil, R"doc(Applies ceil to the elements of the input tensor ``{0}``. Support provided only for Wormhole_B0.)doc"); detail::bind_unary_op(m_tensor, "atan", atan, R"doc(Returns a new tensor with the arctan of the elements of the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "asin", asin, R"doc(Returns a new tensor with the arcsine of the elements of the input tensor ``{0}``.)doc"); detail::bind_unary_op(m_tensor, "acos", acos, R"doc(Returns a new tensor with the arccosine of the elements of the input tensor ``{0}``.)doc"); diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h index 58f4aad94a4..33489e62388 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h @@ -19,6 +19,7 @@ #include "llk_math_eltwise_unary_sfpu_sign.h" #include "llk_math_eltwise_unary_sfpu_signbit.h" #include "llk_math_eltwise_unary_sfpu_floor.h" +#include "llk_math_eltwise_unary_sfpu_ceil.h" #include "llk_math_eltwise_unary_sfpu_silu.h" #include "llk_math_eltwise_unary_sfpu_square.h" #include "llk_math_eltwise_unary_sfpu_tanh.h" diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h new file mode 100644 index 00000000000..1f9b0af0a19 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" +#include "noc_nonblocking_api.h" +#include "limits.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_ceil() +{ + for (int d = 0; d < ITERATIONS; d++) + { + vFloat input = dst_reg[0]; + vFloat result; + + v_if (input <= SHRT_MIN || input > SHRT_MAX) { + result = input; + } + v_endif; + + v_if (input > SHRT_MIN && input <= SHRT_MAX) { + vInt tmp = float_to_int16(input); //TODO: Replace float_to_int16 to float_to_int32 once it is available + result = int32_to_float(tmp); + } + v_endif; + + dst_reg[0] = result; + dst_reg++; + } +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h new file mode 100644 index 00000000000..f3db9269e6e --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_unary_sfpu_init.h" +#include "llk_math_eltwise_unary_sfpu_params.h" +#include "ckernel_sfpu_ceil.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_unary_sfpu_ceil_init() { + llk_math_eltwise_unary_sfpu_init(); +} + +template +inline void llk_math_eltwise_unary_sfpu_ceil(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params + (ckernel::sfpu::calculate_ceil, + dst_index, vector_mode); +} + +} diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h index 85bff7d36c0..8aafadf0ff3 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h @@ -84,5 +84,6 @@ enum SfpuType { left_shift, remainder, fmod, + ceil, unused, }; diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h new file mode 100644 index 00000000000..f7d069c275c --- /dev/null +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include "compute_kernel_api/common_globals.h" +#ifdef TRISC_MATH +#include "llk_math_eltwise_unary_sfpu_ceil.h" +#define MAIN math_main() +#define MATH(x) x +#else +#define MATH(x) +#endif + + + +namespace ckernel { + +/** + * Please refer to documentation for any_init. + */ +ALWI void ceil_tile_init() { + MATH(( llk_math_eltwise_unary_sfpu_ceil_init() )); +} + +/** + * Performs ceil operation on each element of a tile + * in DST register at index tile_index. The DST register buffer must be in + * acquired state via *acquire_dst* call. This call is blocking and is only + * available on the compute engine.
+ * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |-----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst | The index of the tile in DST register buffer to perform ceil operation on | uint32_t | Must be less than the size of the DST register buffer | True | + */ +ALWI void ceil_tile(uint32_t idst) { + MATH(( llk_math_eltwise_unary_sfpu_ceil(idst) )); +} + + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h index 3cb04a911be..7a7f116194d 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h @@ -104,6 +104,10 @@ #include "compute_kernel_api/eltwise_unary/fmod.h" #endif +#if SFPU_OP_CEIL_INCLUDE +#include "compute_kernel_api/eltwise_unary/ceil.h" +#endif + #if SFPU_OP_BINOP_WITH_SCALAR_INCLUDE #include "compute_kernel_api/eltwise_unary/binop_with_scalar.h" #endif diff --git a/ttnn/cpp/ttnn/operations/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv2d.cpp index 8d650110b55..c6913670e95 100644 --- a/ttnn/cpp/ttnn/operations/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv2d.cpp @@ -82,7 +82,7 @@ ParallelConfig determine_parallel_config( uint32_t total_cores_for_channels = block_shard_orientation == ShardOrientation::COL_MAJOR ? device_grid_size[1] : device_grid_size[0]; uint32_t num_cores_channels = find_closest_common_largest_divisor( - conv_out_2d_matrix_width_ntiles, ceil((double)input_channels / (double)32), total_cores_for_channels); + conv_out_2d_matrix_width_ntiles, std::ceil((double)input_channels / (double)32), total_cores_for_channels); uint32_t cores_x = block_shard_orientation == ShardOrientation::COL_MAJOR ?
num_cores_nhw : num_cores_channels; uint32_t cores_y = diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/device/unary_backward_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/device/unary_backward_op.cpp index eea911c5328..1d1b2be641d 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/device/unary_backward_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/device/unary_backward_op.cpp @@ -90,7 +90,7 @@ std::vector _add_bw( std::vector _unary_comp_bw(const Tensor& grad, const MemoryConfig& output_mem_config) { std::vector grad_tensor; - Tensor zero_grad = tt::tt_metal::zeros_like(grad, output_mem_config); + Tensor zero_grad = ttnn::operations::creation::zeros_like(grad, grad.get_dtype(), grad.get_layout(), std::nullopt, output_mem_config); grad_tensor.emplace_back(zero_grad); return grad_tensor; } @@ -121,7 +121,7 @@ std::vector _frac_bw(const Tensor& grad, const Tensor& input, const Memo std::vector _trunc_bw(const Tensor& grad, const Tensor& input, const MemoryConfig& output_mem_config) { std::vector grad_tensor; - Tensor grad_result = tt::tt_metal::zeros_like(grad, output_mem_config); + Tensor grad_result = ttnn::operations::creation::zeros_like(grad, grad.get_dtype(), grad.get_layout(), std::nullopt, output_mem_config); grad_tensor.emplace_back(grad_result); return grad_tensor; } @@ -147,7 +147,7 @@ std::vector _log_sigmoid_bw(const Tensor& grad, const Tensor& input, con std::vector _fill_zero_bw(const Tensor& grad, const Tensor& input, const MemoryConfig& output_mem_config) { std::vector grad_tensor; - Tensor result = tt::tt_metal::zeros_like(grad, output_mem_config); + Tensor result = ttnn::operations::creation::zeros_like(grad, grad.get_dtype(), grad.get_layout(), std::nullopt, output_mem_config); grad_tensor.emplace_back(result); return grad_tensor; } @@ -254,7 +254,7 @@ std::vector _hardsigmoid_bw(const Tensor& grad, const Tensor& input, con ttnn::ge(input, 3, std::nullopt, output_mem_config), std::nullopt, 
output_mem_config), - tt::tt_metal::zeros_like(input, output_mem_config), + ttnn::operations::creation::zeros_like(input, input.get_dtype(), input.get_layout(), std::nullopt, output_mem_config), ttnn::multiply(grad, 1.0 / 6), output_mem_config); grad_tensor.emplace_back(grad_a); @@ -389,7 +389,7 @@ std::vector _hardshrink_bw( const Tensor& grad, const Tensor& input_tensor, float lambd, const MemoryConfig& output_mem_config) { std::vector grad_tensor; Tensor hardshrink_result = hardshrink(input_tensor, lambd, output_mem_config); - Tensor result = where(eqz(hardshrink_result, output_mem_config), 0.0f, grad, output_mem_config); + Tensor result = where(ttnn::eqz(hardshrink_result, output_mem_config), 0.0f, grad, output_mem_config); grad_tensor.emplace_back(result); return grad_tensor; } @@ -420,7 +420,7 @@ std::vector _leaky_relu_bw( const Tensor& grad, const Tensor& input, float negative_slope, const MemoryConfig& output_mem_config) { std::vector grad_tensor; Tensor grad_result = where( - gtz(input, output_mem_config), grad, ttnn::multiply(grad, negative_slope, std::nullopt, output_mem_config), output_mem_config); + ttnn::gtz(input, output_mem_config), grad, ttnn::multiply(grad, negative_slope, std::nullopt, output_mem_config), output_mem_config); grad_tensor.emplace_back(grad_result); return grad_tensor; } @@ -432,7 +432,7 @@ std::vector _elu_bw( const Tensor& grad, const Tensor& input, float alpha, const MemoryConfig& output_mem_config) { std::vector grad_tensor; Tensor grad_result = where( - gez(input, output_mem_config), + ttnn::gez(input, output_mem_config), grad, ttnn::multiply(grad, ttnn::multiply(ttnn::exp(input, false, output_mem_config), alpha, std::nullopt, output_mem_config), std::nullopt, output_mem_config), output_mem_config); @@ -447,7 +447,7 @@ std::vector _celu_bw( const Tensor& grad, const Tensor& input, float alpha, const MemoryConfig& output_mem_config) { std::vector grad_tensor; Tensor div_result = ttnn::multiply( - input, 
recip(ttnn::operations::creation::full_like(input, alpha, input.get_dtype(), input.get_layout(), std::nullopt, output_mem_config), output_mem_config), std::nullopt, output_mem_config); + input, ttnn::reciprocal(ttnn::operations::creation::full_like(input, alpha, input.get_dtype(), input.get_layout(), std::nullopt, output_mem_config), output_mem_config), std::nullopt, output_mem_config); Tensor exp_result = ttnn::exp(div_result, false, output_mem_config); Tensor grad_result = where( ttnn::gt(input, ttnn::operations::creation::zeros_like(input, input.get_dtype(), input.get_layout(), std::nullopt, output_mem_config), std::nullopt, output_mem_config), @@ -627,7 +627,7 @@ std::vector _atanh_bw(const Tensor& grad, const Tensor& input, const Mem ttnn::eq(input, -1, std::nullopt, output_mem_config), std::nullopt, output_mem_config), - nez(grad, output_mem_config)), + ttnn::nez(grad, output_mem_config)), t_inf, grad_a, output_mem_config);