From e3d0f903dbbc3657a86eb69ff31c073c0106e7c3 Mon Sep 17 00:00:00 2001 From: mouliraj-mcw Date: Tue, 18 Jun 2024 07:29:15 +0000 Subject: [PATCH] #8681: Add ceil op --- docs/source/ttnn/ttnn/dependencies/tt_lib.rst | 2 + .../python_api_testing/sweep_tests/op_map.py | 6 ++- .../pytests/tt_dnn/test_div_trunc.py | 4 +- .../pytests/tt_dnn/test_eltwise_unary.py | 6 +-- .../sweep_tests/pytests/tt_dnn/test_frac.py | 8 ++-- .../pytests/tt_dnn/test_rfloor_div.py | 11 ++--- .../pytests/tt_dnn/test_unary_div_trunc.py | 11 ++--- .../pytests/tt_dnn/test_unary_rdiv_trunc.py | 11 ++--- .../sweep_tests/pytorch_ops.py | 4 ++ .../sweep_tests/tt_lib_ops.py | 1 + .../op_library/composite/composite_ops.cpp | 10 ++++- .../op_library/composite/composite_ops.hpp | 4 +- tt_eager/tt_dnn/op_library/conv/conv_op.cpp | 18 ++++---- .../optimized_conv_op.cpp | 2 +- .../optimized_conv_op_sharded.cpp | 2 +- .../optimized_conv_op_sharded_v2.cpp | 2 +- .../op_library/conv/optimized_conv_op.cpp | 2 +- .../eltwise_unary/eltwise_unary_op.cpp | 3 ++ .../eltwise_unary/eltwise_unary_op.hpp | 4 +- .../multi_core/eltwise_unary_op_sharded.cpp | 4 +- tt_eager/tt_dnn/op_library/pool/max_pool.hpp | 2 +- .../csrc/tt_lib_bindings_tensor_xary_ops.cpp | 1 + .../metal/llk_api/llk_math_unary_sfpu_api.h | 1 + .../llk_api/llk_sfpu/ckernel_sfpu_ceil.h | 43 ++++++++++++++++++ .../llk_math_eltwise_unary_sfpu_ceil.h | 27 +++++++++++ .../metal/llk_api/llk_sfpu_types.h | 1 + .../compute_kernel_api/eltwise_unary/ceil.h | 45 +++++++++++++++++++ .../eltwise_unary/sfpu_split_includes.h | 4 ++ ttnn/cpp/ttnn/operations/conv2d.cpp | 2 +- 29 files changed, 187 insertions(+), 54 deletions(-) create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h create mode 100644 tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h diff --git a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst index d8a2d40d05fd..ab2cacd3c244 100644 --- a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst +++ b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst @@ -458,6 +458,8 @@ Tensor elementwise operations .. autofunction:: tt_lib.tensor.floor +.. autofunction:: tt_lib.tensor.ceil + .. autofunction:: tt_lib.tensor.trunc .. 
autofunction:: tt_lib.tensor.frac diff --git a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py index 5cb289a630e6..7797eacc0def 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py @@ -632,6 +632,10 @@ "tt_op": tt_lib_ops.eltwise_floor, "pytorch_op": pytorch_ops.floor, }, + "eltwise-ceil": { + "tt_op": tt_lib_ops.eltwise_ceil, + "pytorch_op": pytorch_ops.ceil, + }, "eltwise-trunc": { "tt_op": tt_lib_ops.eltwise_trunc, "pytorch_op": pytorch_ops.trunc, @@ -648,7 +652,7 @@ "tt_op": tt_lib_ops.eltwise_unary_floor_div, "pytorch_op": pytorch_ops.unary_floor_div, }, - "eltwise-_rfloor_div": { + "eltwise-rfloor_div": { "tt_op": tt_lib_ops.eltwise_rfloor_div, "pytorch_op": pytorch_ops.rfloor_div, }, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py index 60f05eb9f59a..21d8f3609c07 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div_trunc.py @@ -45,9 +45,9 @@ def test_run_div_trunc( device, ): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] + [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] test_args.update({"output_mem_config": dst_mem_config}) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py index 6b6d6e6a4aaa..9bdf8d5e1826 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_eltwise_unary.py @@ -585,7 +585,7 @@ def test_run_eltwise_sign_ops( test_args, ) - @pytest.mark.parametrize("round_off_method", ["floor", "trunc"]) + @pytest.mark.parametrize("round_off_method", ["floor", "ceil", "trunc"]) @skip_for_grayskull("#ToDo: GS implementation needs to be done for Floor") def test_run_eltwise_round_off_ops( self, @@ -597,9 +597,7 @@ def test_run_eltwise_round_off_ops( output_mem_config, ): datagen_func = [ - generation_funcs.gen_func_with_cast( - partial(generation_funcs.gen_rand, low=-1000, high=1000), torch.bfloat16 - ) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] test_args.update( diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py index ee2862d5004c..083fabc783f0 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_frac.py @@ -26,9 +26,9 @@ @pytest.mark.parametrize( "input_shapes", [ - [[1, 1, 32, 32], [1, 1, 32, 32]], - [[1, 1, 320, 384], [1, 1, 320, 384]], - [[1, 3, 320, 384], [1, 3, 
320, 384]], + [[1, 1, 32, 32]], + [[1, 1, 320, 384]], + [[1, 3, 320, 384]], ], ) @pytest.mark.parametrize( @@ -44,7 +44,7 @@ def test_run_frac( device, ): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] test_args.update({"output_mem_config": dst_mem_config}) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py index 051cbdafe9ad..cb05219d1fb1 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_rfloor_div.py @@ -4,6 +4,7 @@ import pytest import torch import random +import numpy as np from functools import partial import tt_lib as ttl from tests.tt_eager.python_api_testing.sweep_tests import ( @@ -29,10 +30,6 @@ [[1, 3, 320, 384], [1, 3, 320, 384]], ], ) -@pytest.mark.parametrize( - "value", - [-5.1, 0.0, 10.9], -) @pytest.mark.parametrize( "dst_mem_config", mem_configs, @@ -42,17 +39,17 @@ class TestRfloor_div: def test_run_rfloor_div( self, input_shapes, - value, dst_mem_config, device, ): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] - test_args.update({"value": value}) + test_args.update({"value": random.uniform(-100, 100) for _ in range(5)}) test_args.update({"output_mem_config": dst_mem_config}) comparison_func = comparison_funcs.comp_pcc + run_single_pytorch_test( "eltwise-rfloor_div", input_shapes, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py index 6f288e24d70a..f93da898418b 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_div_trunc.py @@ -4,6 +4,7 @@ import pytest import torch import random +import numpy as np from functools import partial import tt_lib as ttl from tests.tt_eager.python_api_testing.sweep_tests import ( @@ -29,10 +30,6 @@ [[1, 3, 320, 384], [1, 3, 320, 384]], ], ) -@pytest.mark.parametrize( - "value", - [-5.1, 0.0, 10.9], -) @pytest.mark.parametrize( "dst_mem_config", mem_configs, @@ -42,17 +39,17 @@ class TestUnary_Div_Trunc: def test_run_unary_div_trunc( self, input_shapes, - value, dst_mem_config, device, ): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] - test_args.update({"value": value}) + test_args.update({"value": random.uniform(-100, 100) for _ in range(5)}) test_args.update({"output_mem_config": dst_mem_config}) comparison_func = comparison_funcs.comp_pcc + run_single_pytorch_test( "eltwise-unary_div_trunc", input_shapes, diff --git 
a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py index 6a734c6db4cb..d985c4d90c78 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_unary_rdiv_trunc.py @@ -4,6 +4,7 @@ import pytest import torch import random +import numpy as np from functools import partial import tt_lib as ttl from tests.tt_eager.python_api_testing.sweep_tests import ( @@ -29,10 +30,6 @@ [[1, 3, 320, 384], [1, 3, 320, 384]], ], ) -@pytest.mark.parametrize( - "value", - [-5.1, 0.0, 10.9], -) @pytest.mark.parametrize( "dst_mem_config", mem_configs, @@ -42,17 +39,17 @@ class TestUnary_Rdiv_Trunc: def test_run_unary_rdiv_trunc( self, input_shapes, - value, dst_mem_config, device, ): datagen_func = [ - generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) + generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16) ] test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] - test_args.update({"value": value}) + test_args.update({"value": random.uniform(-100, 100) for _ in range(5)}) test_args.update({"output_mem_config": dst_mem_config}) comparison_func = comparison_funcs.comp_pcc + run_single_pytorch_test( "eltwise-unary_rdiv_trunc", input_shapes, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index d8c079d6181b..80b51c459c70 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -651,6 +651,10 @@ def floor(x, *args, **kwargs): return torch.floor(x) +def ceil(x, *args, **kwargs): + return torch.ceil(x) + + def trunc(x, *args, **kwargs): return torch.trunc(x) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index b001ce3dc268..c420e41e9d5f 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -2645,6 +2645,7 @@ def unary_op( transpose_nw = make_unary_op(partial(ttl.tensor.transpose, dim0=0, dim1=-1)) transpose_cw = make_unary_op(partial(ttl.tensor.transpose, dim0=1, dim1=-1)) eltwise_floor = make_unary_op(ttl.tensor.floor) +eltwise_ceil = make_unary_op(ttl.tensor.ceil) eltwise_trunc = make_unary_op(ttl.tensor.trunc) eltwise_frac = make_unary_op(ttl.tensor.frac) diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp index 3574a058f325..bf6e79fd44a4 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.cpp @@ -995,8 +995,10 @@ Tensor trunc(const Tensor& input, const MemoryConfig& output_mem_config) { } Tensor _frac(const Tensor& input, const MemoryConfig& output_mem_config) { + auto arch = input.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); Tensor trunc_res = trunc(input, output_mem_config); - Tensor result = sub(input, trunc_res, std::nullopt, output_mem_config); + Tensor result = ttnn::subtract(input, trunc_res, std::nullopt, output_mem_config); return result; } Tensor frac(const Tensor& input, const MemoryConfig& 
output_mem_config) { @@ -1007,6 +1009,8 @@ Tensor _div_trunc( const Tensor& input_a, const Tensor& input_b, const MemoryConfig& output_mem_config) { + auto arch = input_a.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); Tensor result = div(input_a, input_b, true); return trunc(result); } @@ -1021,6 +1025,8 @@ Tensor _div_trunc_overload( const Tensor& input, float value, const MemoryConfig& output_mem_config) { + auto arch = input.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); Tensor result = div_unary(input, value); return trunc(result); } @@ -1035,6 +1041,8 @@ Tensor _unary_rdiv_trunc( float value, const Tensor& input, const MemoryConfig& output_mem_config) { + auto arch = input.device()->arch(); + TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole"); Tensor result = div_unary(value, input); return trunc(result); } diff --git a/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp b/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp index fbb11f7a9b30..4436a0761923 100644 --- a/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp +++ b/tt_eager/tt_dnn/op_library/composite/composite_ops.hpp @@ -190,13 +190,11 @@ Tensor fmod( const Tensor& input_b, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); -<<<<<<< HEAD Tensor trunc(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); -======= + Tensor frac( const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); ->>>>>>> #8681: Add frac op Tensor round( const Tensor& input, diff --git a/tt_eager/tt_dnn/op_library/conv/conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/conv_op.cpp index d581a8256308..3621c1db75c9 100644 --- a/tt_eager/tt_dnn/op_library/conv/conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/conv_op.cpp @@ -34,10 +34,10 @@ pair, vector> compute_conv_activation_as_mm_shape(Sha // pad height uint32_t num_rows = (uint32_t) conv_output_h*conv_output_w; uint32_t act_block_h_datums = act_block_h_ntiles * TILE_HEIGHT; - uint32_t num_rows_padded = (uint32_t) (ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); + uint32_t num_rows_padded = (uint32_t) (std::ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); uint32_t num_cols = conv_activation_shape[3] * filter_h * filter_w; uint32_t act_block_w_datums = act_block_w_ntiles * TILE_WIDTH; - uint32_t num_cols_padded = (uint32_t) (ceil((double) num_cols / (double) act_block_w_datums ) * act_block_w_datums); + uint32_t num_cols_padded = (uint32_t) (std::ceil((double) num_cols / (double) act_block_w_datums ) * act_block_w_datums); if(use_fast_reader) { assert(act_block_w_datums >= conv_activation_shape[3] * filter_w); num_cols_padded = act_block_w_datums * filter_h; @@ -218,7 +218,7 @@ operation::ProgramWithCallbacks conv_as_large_bmm_single_core_(const Tensor& a, uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = 
(output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); uint32_t output_row_size_bytes = output_channels_padded_to_tile_width * num_bytes_of_df; @@ -726,7 +726,7 @@ std::pair, vector> generate_conv_weight_address_map( address_map_metadata.push_back(address_map_current_group_dram_address_offset); address_map_metadata.push_back(address_map_current_group_size); // Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements) - uint32_t address_map_current_group_size_padded = (uint32_t) (ceil((double) address_map_current_group_size / (double) 8) * 8); + uint32_t address_map_current_group_size_padded = (uint32_t) (std::ceil((double) address_map_current_group_size / (double) 8) * 8); if(address_map_current_group_size_padded != address_map_current_group_size) { assert(address_map_current_group_size_padded > address_map_current_group_size); address_map.insert(address_map.end(), address_map_current_group_size_padded - address_map_current_group_size, 0); @@ -764,8 +764,8 @@ std::pair, vector> generate_conv_activation_address_m int conv_output_w = ((conv_input_y - S + (2 * Pad_W)) / V) + 1; uint32_t matrix_height_unpadded = conv_output_h * conv_output_w; uint32_t matrix_width_unpadded = conv_input_z * R * S; - uint32_t matrix_height = (uint32_t) (ceil((double) matrix_height_unpadded / (double) act_block_h_datums ) * act_block_h_datums); - uint32_t matrix_width = (uint32_t) (ceil((double) matrix_width_unpadded / (double) act_block_w_datums ) * act_block_w_datums); + uint32_t matrix_height = (uint32_t) (std::ceil((double) matrix_height_unpadded / (double) act_block_h_datums ) * act_block_h_datums); + uint32_t matrix_width = (uint32_t) (std::ceil((double) matrix_width_unpadded / (double) act_block_w_datums ) * act_block_w_datums); uint32_t num_groups = num_blocks_act_h * num_blocks_act_w * num_blocks_weight_w; uint32_t channel_stick_size = conv_input_z; @@ -854,7 +854,7 @@ std::pair, vector> generate_conv_activation_address_m address_map_metadata.push_back(address_map_current_group_dram_address_offset); address_map_metadata.push_back(address_map_current_group_size); // Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements) - uint32_t address_map_current_group_size_padded = (uint32_t) (ceil((double) address_map_current_group_size / (double) 8) * 8); + uint32_t address_map_current_group_size_padded = (uint32_t) (std::ceil((double) address_map_current_group_size / (double) 8) * 8); if(address_map_current_group_size_padded != address_map_current_group_size) { assert(address_map_current_group_size_padded > address_map_current_group_size); address_map.insert(address_map.end(), address_map_current_group_size_padded - address_map_current_group_size, 0); @@ -903,7 +903,7 @@ std::pair, vector> populate_address_map_vectors_for_r address_map_raw_current_group_start + current_group_size); address_map_raw_index += current_group_size; // Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements) - uint32_t current_group_size_padded = (uint32_t) (ceil((double) current_group_size / (double) 8) * 8); + uint32_t current_group_size_padded = (uint32_t) (std::ceil((double) current_group_size / (double) 8) * 8); if(current_group_size_padded != current_group_size) { assert(current_group_size_padded > current_group_size); 
address_map.insert(address_map.end(), current_group_size_padded - current_group_size, 0); @@ -988,7 +988,7 @@ operation::ProgramWithCallbacks conv_as_large_bmm_with_address_map_single_core_( // it removes the padding done for block width but it doesn't remove padding done for tiled width uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= Wb); - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); uint32_t output_row_size_bytes = output_channels_padded_to_tile_width * num_bytes_of_df; diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp index 6ca3178702ff..5a0a8a9814fe 100644 --- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv/optimized_conv_op.cpp @@ -262,7 +262,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_(const Tensor& a, cons uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp index 2e05661b4915..f46cdca2a1bb 100644 --- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp +++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded.cpp @@ -278,7 +278,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_(const Tensor& uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH); assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; - uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); + uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? 
weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); assert(last_block_width_datums % TILE_WIDTH == 0); diff --git a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp index 97fe8aea401d..a352d53ba7ce 100644 --- a/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp +++ b/tt_eager/tt_dnn/op_library/conv/multi_core_optimized_conv_sharded/optimized_conv_op_sharded_v2.cpp @@ -442,7 +442,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( assert(output_channels_padded_to_tile_width <= weight_matrix_width); uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH; uint32_t num_blocks_output_w = - (uint32_t)ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums); + (uint32_t)std::ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums); uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums); diff --git a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp index 725fcba2499e..1f363d4c750e 100644 --- a/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp +++ b/tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp @@ -40,7 +40,7 @@ pair, vector> compute_opt_conv_activation_as_mm_shape // pad height uint32_t num_rows = (uint32_t) batch_size * conv_output_h * conv_output_w; uint32_t act_block_h_datums = act_block_h_ntiles * TILE_HEIGHT; - uint32_t num_rows_padded = (uint32_t) (ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); + uint32_t num_rows_padded = (uint32_t) (std::ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums); uint32_t num_cols = conv_activation_shape[3] * filter_h * filter_w; uint32_t num_cols_padded = round_up(conv_activation_shape[3] * filter_w, TILE_WIDTH) * filter_h; return {{1, num_rows_padded, num_cols_padded}, {1, num_rows, num_cols}}; diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp index b5bab4f7f01d..2f2592d6f966 100644 --- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp +++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.cpp @@ -74,6 +74,7 @@ void update_macro_defines(UnaryOpType op_type, std::map get_op_init_and_func_default(UnaryOpType op_type, stri op_init_and_name = {"signbit_tile_init();", fmt::format("signbit_tile({});", idst)}; break; case UnaryOpType::FLOOR: op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile({});", idst)}; break; + case UnaryOpType::CEIL: op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile({});", idst)}; break; case UnaryOpType::SIN: op_init_and_name = {"sin_tile_init();", fmt::format("sin_tile({});", idst)}; break; case UnaryOpType::COS: op_init_and_name = {"cos_tile_init();", fmt::format("cos_tile({});", idst)}; break; case UnaryOpType::ISFINITE: @@ -371,6 +373,7 @@ inline void validate_supported_arch_dtype(tt::ARCH arch, DataType input_datatype switch (op_type) { case UnaryOpType::REMAINDER: case UnaryOpType::FLOOR: + case UnaryOpType::CEIL: case UnaryOpType::LEFT_SHIFT: case UnaryOpType::RIGHT_SHIFT: TT_FATAL(arch == 
tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp
index 6226b08ffa91..8369a6ce5a86 100644
--- a/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp
+++ b/tt_eager/tt_dnn/op_library/eltwise_unary/eltwise_unary_op.hpp
@@ -87,7 +87,8 @@ enum class UnaryOpType {
     FLOOR,
     LEFT_SHIFT,
     REMAINDER,
-    FMOD
+    FMOD,
+    CEIL
 };
 
 template <typename T>
@@ -582,6 +583,7 @@ constexpr auto isneginf = make_eltwise_unary<UnaryOpType::ISNEGINF>{};
 constexpr auto isnan = make_eltwise_unary<UnaryOpType::ISNAN>{};
 constexpr auto signbit = make_eltwise_unary<UnaryOpType::SIGNBIT>{};
 constexpr auto floor = make_eltwise_unary<UnaryOpType::FLOOR>{};
+constexpr auto ceil = make_eltwise_unary<UnaryOpType::CEIL>{};
 constexpr auto atan = make_eltwise_unary<UnaryOpType::ATAN>{};
 constexpr auto nez = make_eltwise_unary<UnaryOpType::NEZ>{};
 constexpr auto logical_not_unary = make_eltwise_unary<UnaryOpType::LOGICAL_NOT_UNARY>{};
diff --git a/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp b/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp
index 15b5f6a25337..a176a863cf5b 100644
--- a/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp
+++ b/tt_eager/tt_dnn/op_library/eltwise_unary/multi_core/eltwise_unary_op_sharded.cpp
@@ -39,8 +39,8 @@ operation::ProgramWithCallbacks eltwise_unary_sharded(const Tensor &input, Tenso
     uint32_t num_tile_per_core = 0;
     if (input.get_dtype() == DataType::BFLOAT8_B) {
-        uint32_t ntiles_along_width = ceil(shard_spec.shape[1] / (float) constants::TILE_WIDTH);
-        uint32_t ntiles_along_height = ceil(shard_spec.shape[0] / (float) constants::TILE_HEIGHT);
+        uint32_t ntiles_along_width = std::ceil(shard_spec.shape[1] / (float) constants::TILE_WIDTH);
+        uint32_t ntiles_along_height = std::ceil(shard_spec.shape[0] / (float) constants::TILE_HEIGHT);
         num_tile_per_core = ntiles_along_width * ntiles_along_height;
     } else {
         TT_FATAL((shard_spec.shape[1] * datum_size(act_df)) % L1_ALIGNMENT == 0, "Shard width should be multiple of L1_ADRESS_ALIGNMENT");
diff --git a/tt_eager/tt_dnn/op_library/pool/max_pool.hpp b/tt_eager/tt_dnn/op_library/pool/max_pool.hpp
index 0fb889f60753..ef4cc32d7598 100644
--- a/tt_eager/tt_dnn/op_library/pool/max_pool.hpp
+++ b/tt_eager/tt_dnn/op_library/pool/max_pool.hpp
@@ -9,7 +9,7 @@
 #include "tt_dnn/op_library/sliding_window_op_infra/sliding_window.hpp"
 
 inline uint32_t ceil_multiple_of(uint32_t n, uint32_t m) {
-    return (uint32_t) ceil((float) n / m) * m;
+    return (uint32_t) std::ceil((float) n / m) * m;
 }
 
 namespace tt {
diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp
index 8eb51c3bd9c6..f0c3a1e19189 100644
--- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp
+++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_xary_ops.cpp
@@ -43,6 +43,7 @@ namespace tt::tt_metal::detail {
         );
         detail::bind_unary_op(m_tensor, "signbit", signbit, R"doc(Applies the signbit function to the elements of the input tensor ``{0}``.)doc");
         detail::bind_unary_op(m_tensor, "floor", floor, R"doc(Applies floor to the elements of the input tensor ``{0}``. Support provided only for Wormhole_B0.)doc");
+        detail::bind_unary_op(m_tensor, "ceil", ceil, R"doc(Applies ceil to the elements of the input tensor ``{0}``. Support provided only for Wormhole_B0.)doc");
         detail::bind_unary_op(m_tensor, "atan", atan, R"doc(Returns a new tensor with the arctan of the elements of the input tensor ``{0}``.)doc");
         detail::bind_unary_op(m_tensor, "asin", asin, R"doc(Returns a new tensor with the arcsine of the elements of the input tensor ``{0}``.)doc");
         detail::bind_unary_op(m_tensor, "acos", acos, R"doc(Returns a new tensor with the arccosine of the elements of the input tensor ``{0}``.)doc");
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
index 58f4aad94a4c..33489e62388e 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_unary_sfpu_api.h
@@ -19,6 +19,7 @@
 #include "llk_math_eltwise_unary_sfpu_sign.h"
 #include "llk_math_eltwise_unary_sfpu_signbit.h"
 #include "llk_math_eltwise_unary_sfpu_floor.h"
+#include "llk_math_eltwise_unary_sfpu_ceil.h"
 #include "llk_math_eltwise_unary_sfpu_silu.h"
 #include "llk_math_eltwise_unary_sfpu_square.h"
 #include "llk_math_eltwise_unary_sfpu_tanh.h"
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
new file mode 100644
index 000000000000..1f9b0af0a193
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "sfpi.h"
+#include "noc_nonblocking_api.h"
+#include "limits.h"
+
+using namespace sfpi;
+
+namespace ckernel {
+namespace sfpu {
+
+template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
+inline void calculate_ceil()
+{
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        vFloat input = dst_reg[0];
+        vFloat result;
+
+        v_if (input <= SHRT_MIN || input > SHRT_MAX) {
+            result = input;
+        }
+        v_endif;
+
+        v_if (input > SHRT_MIN && input <= SHRT_MAX) {
+            vInt tmp = float_to_int16(input); // TODO: Replace float_to_int16 with float_to_int32 once it is available
+            result = int32_to_float(tmp);
+            // The int16 conversion can land below the input; bump up by one in
+            // that case so the result is a true ceiling rather than a rounding.
+            v_if (result < input) {
+                result = result + 1;
+            }
+            v_endif;
+        }
+        v_endif;
+
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+}
+
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
new file mode 100644
index 000000000000..f3db9269e6ec
--- /dev/null
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "llk_math_eltwise_unary_sfpu_init.h"
+#include "llk_math_eltwise_unary_sfpu_params.h"
+#include "ckernel_sfpu_ceil.h"
+
+namespace ckernel {
+
+// New LLK SFPU APIs
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_ceil_init() {
+    llk_math_eltwise_unary_sfpu_init<SfpuType::ceil, APPROXIMATE>();
+}
+
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_ceil(uint dst_index, int vector_mode = (int)VectorMode::RC) {
+    llk_math_eltwise_unary_sfpu_params<APPROXIMATE>
+        (ckernel::sfpu::calculate_ceil<APPROXIMATE>,
+        dst_index, vector_mode);
+}
+
+}
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
index 85bff7d36c0f..8aafadf0ff3b 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h
@@ -84,5 +84,6 @@ enum SfpuType {
     left_shift,
     remainder,
     fmod,
+    ceil,
     unused,
 };
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h
new file mode 100644
index 000000000000..f7d069c275cc
--- /dev/null
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+
+#include "compute_kernel_api/common_globals.h"
+#ifdef TRISC_MATH
+#include "llk_math_eltwise_unary_sfpu_ceil.h"
+#define MAIN math_main()
+#define MATH(x) x
+#else
+#define MATH(x)
+#endif
+
+
+
+namespace ckernel {
+
+/**
+ * Please refer to documentation for any_init.
+ */
+ALWI void ceil_tile_init() {
+    MATH(( llk_math_eltwise_unary_sfpu_ceil_init<APPROX>() ));
+}
+
+/**
+ * Performs the ceil operation on each row of the tile in the DST register at
+ * index tile_index. The DST register buffer must be in acquired state via
+ * *acquire_dst* call. This call is blocking and is only available on the
+ * compute engine.
+ *
+ * Return value: None
+ *
+ * | Argument        | Description                                                                  | Type     | Valid Range                                            | Required |
+ * |-----------------|------------------------------------------------------------------------------|----------|--------------------------------------------------------|----------|
+ * | idst            | The index of the tile in DST register buffer to apply the ceil operation to   | uint32_t | Must be less than the size of the DST register buffer  | True     |
+ */
+ALWI void ceil_tile(uint32_t idst) {
+    MATH(( llk_math_eltwise_unary_sfpu_ceil<APPROX>(idst) ));
+}
+
+
+}  // namespace ckernel
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h
index 3cb04a911bef..7a7f116194df 100644
--- a/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/sfpu_split_includes.h
@@ -104,6 +104,10 @@
 #include "compute_kernel_api/eltwise_unary/fmod.h"
 #endif
 
+#if SFPU_OP_CEIL_INCLUDE
+#include "compute_kernel_api/eltwise_unary/ceil.h"
+#endif
+
 #if SFPU_OP_BINOP_WITH_SCALAR_INCLUDE
 #include "compute_kernel_api/eltwise_unary/binop_with_scalar.h"
 #endif
diff --git a/ttnn/cpp/ttnn/operations/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv2d.cpp
index 8d650110b55d..c6913670e957 100644
--- a/ttnn/cpp/ttnn/operations/conv2d.cpp
+++ b/ttnn/cpp/ttnn/operations/conv2d.cpp
@@ -82,7 +82,7 @@ ParallelConfig determine_parallel_config(
     uint32_t total_cores_for_channels =
         block_shard_orientation == ShardOrientation::COL_MAJOR ?
device_grid_size[1] : device_grid_size[0]; uint32_t num_cores_channels = find_closest_common_largest_divisor( - conv_out_2d_matrix_width_ntiles, ceil((double)input_channels / (double)32), total_cores_for_channels); + conv_out_2d_matrix_width_ntiles, std::ceil((double)input_channels / (double)32), total_cores_for_channels); uint32_t cores_x = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_nhw : num_cores_channels; uint32_t cores_y =
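
For reference, a minimal sketch of how the new op could be exercised from Python once this patch lands. This is not part of the patch: it assumes an already-open `device` handle (as provided by the usual pytest fixture) and the tt_lib tensor conversion helpers used elsewhere in these sweep tests; shapes and tolerances are illustrative only.

import torch
import tt_lib as ttl

# Host-side golden reference.
torch_input = (torch.rand(1, 1, 32, 32) * 200 - 100).to(torch.bfloat16)
golden = torch.ceil(torch_input)

# Device-side op added by this patch (Wormhole_B0 only, per the binding docs).
tt_input = (
    ttl.tensor.Tensor(torch_input, ttl.tensor.DataType.BFLOAT16)
    .to(ttl.tensor.Layout.TILE)
    .to(device)
)
tt_output = ttl.tensor.ceil(tt_input)
result = tt_output.cpu().to(ttl.tensor.Layout.ROW_MAJOR).to_torch()

# Compare against the golden; the sweep tests use a PCC check instead.
assert torch.allclose(result.to(torch.float32), golden.to(torch.float32), atol=1e-2, rtol=0)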