From 1509a6e8c776471b235042b2070a2ee1930c3aaa Mon Sep 17 00:00:00 2001 From: Mouliraj Elamurugan Date: Thu, 12 Dec 2024 16:22:00 +0530 Subject: [PATCH] #15647:Update ceil op (#15657) ### Ticket Link to Github Issue #15647 ### Problem description - Ceil op ignore values outside range of i16 ### What's changed - Updated the logic to support fp32 ### Profiling Results : Shape used [1, 1, 102400, 32] Kernel Duration [ns] - Bfloat16 : 70841 - Float32 : 153649 ### Checklist - [ ] All Post commit CI --- .../operations/eltwise/test_unary.py | 19 +++++++++++++ .../llk_api/llk_sfpu/ckernel_sfpu_ceil.h | 17 +++++++++++- .../llk_math_eltwise_unary_sfpu_ceil.h | 5 ++++ .../llk_api/llk_sfpu/ckernel_sfpu_ceil.h | 16 ++++++++++- .../llk_math_eltwise_unary_sfpu_ceil.h | 5 ++++ .../compute_kernel_api/eltwise_unary/ceil.h | 18 ++++++++++++- .../eltwise/unary/common/unary_op_types.hpp | 1 + .../eltwise/unary/common/unary_op_utils.cpp | 10 ++++--- .../ttnn/operations/eltwise/unary/unary.cpp | 27 ++++++++++++++++++- .../ttnn/operations/eltwise/unary/unary.hpp | 14 +++++++++- 10 files changed, 124 insertions(+), 8 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_unary.py b/tests/ttnn/unit_tests/operations/eltwise/test_unary.py index da305202e9c..805d73ca5c9 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_unary.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_unary.py @@ -447,3 +447,22 @@ def test_unary_floor(input_shapes, device): golden_tensor = golden_function(in_data1) output_tensor = ttnn.to_torch(output_tensor) assert_with_pcc(golden_tensor, output_tensor, 0.999) + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + ( + (torch.Size([1, 1, 32, 32])), + (torch.Size([1, 1, 320, 384])), + (torch.Size([1, 3, 320, 384])), + ), +) +def test_unary_ceil(input_shapes, device): + in_data1 = torch.empty(input_shapes, dtype=torch.float32).uniform_(-43566, 43565) + input_tensor1 = ttnn.from_torch(in_data1, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + output_tensor = ttnn.ceil(input_tensor1) + golden_function = ttnn.get_golden_function(ttnn.ceil) + golden_tensor = golden_function(in_data1) + output_tensor = ttnn.to_torch(output_tensor) + assert_with_pcc(golden_tensor, output_tensor, 0.999) diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h index d803f841dbf..c34da7ec973 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h @@ -9,6 +9,7 @@ #include "sfpi.h" #include "noc_nonblocking_api.h" #include "limits.h" +#include "ckernel_sfpu_floor.h" using namespace sfpi; @@ -20,7 +21,7 @@ inline void calculate_ceil() { for (int d = 0; d < ITERATIONS; d++) { vFloat result = dst_reg[0]; vFloat v = result; - vInt tmp = float_to_int16(result, 0); // TODO: Replace float_to_int16 to float_to_int32 once it is available + vInt tmp = float_to_int16(result, 0); result = int32_to_float(tmp, 0); v_if(result < v) { result = result + 1; } v_endif; @@ -31,5 +32,19 @@ inline void calculate_ceil() { } } +template +inline void calculate_ceil_float32() { + for (int d = 0; d < ITERATIONS; d++) { + vFloat result = dst_reg[0]; + vFloat v = result; + vInt tmp = float_to_int32(result); + result = int32_to_float(tmp, 0); + v_if(result < v) { result = result + 1; } + v_endif; + dst_reg[0] = result; + dst_reg++; + } +} + } // namespace sfpu } // namespace ckernel diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h index 690528ed0c4..e328d869eb8 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h @@ -22,4 +22,9 @@ inline void llk_math_eltwise_unary_sfpu_ceil(uint dst_index, int vector_mode = ( llk_math_eltwise_unary_sfpu_params(ckernel::sfpu::calculate_ceil, dst_index, vector_mode); } +template +inline void llk_math_eltwise_unary_sfpu_ceil_float32(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_ceil_float32, dst_index, vector_mode); +} } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h index d803f841dbf..983faf8db6f 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h @@ -9,6 +9,7 @@ #include "sfpi.h" #include "noc_nonblocking_api.h" #include "limits.h" +#include "ckernel_sfpu_floor.h" using namespace sfpi; @@ -20,7 +21,7 @@ inline void calculate_ceil() { for (int d = 0; d < ITERATIONS; d++) { vFloat result = dst_reg[0]; vFloat v = result; - vInt tmp = float_to_int16(result, 0); // TODO: Replace float_to_int16 to float_to_int32 once it is available + vInt tmp = float_to_int16(result, 0); result = int32_to_float(tmp, 0); v_if(result < v) { result = result + 1; } v_endif; @@ -31,5 +32,18 @@ inline void calculate_ceil() { } } +template +inline void calculate_ceil_float32() { + for (int d = 0; d < ITERATIONS; d++) { + vFloat result = dst_reg[0]; + vFloat v = result; + vInt tmp = float_to_int32(result); + result = int32_to_float(tmp, 0); + v_if(result < v) { result = result + 1; } + v_endif; + dst_reg[0] = result; + dst_reg++; + } +} } // namespace sfpu } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h index 690528ed0c4..e328d869eb8 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h @@ -22,4 +22,9 @@ inline void llk_math_eltwise_unary_sfpu_ceil(uint dst_index, int vector_mode = ( llk_math_eltwise_unary_sfpu_params(ckernel::sfpu::calculate_ceil, dst_index, vector_mode); } +template +inline void llk_math_eltwise_unary_sfpu_ceil_float32(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_ceil_float32, dst_index, vector_mode); +} } // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h index bc2f2e7863b..3d8d27724fd 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h @@ -31,9 +31,25 @@ ALWI void ceil_tile_init() { MATH((llk_math_eltwise_unary_sfpu_ceil_init * | Argument | Description | Type | Valid * Range | Required | * |-----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| - * | idst | The index of the tile in DST register buffer to modify the sign bit of | uint32_t | Must be + * | idst | The index of the tile in DST register buffer to perform ceil operation | uint32_t | Must be * less than the size of the DST register buffer | True | */ ALWI void ceil_tile(uint32_t idst) { MATH((llk_math_eltwise_unary_sfpu_ceil(idst))); } +/** + * Performs ceil operation on each row of a tile. + * in DST register at index tile_index. The DST register buffer must be in + * acquired state via *acquire_dst* call. This call is blocking and is only + * available on the compute engine. + * + * Return value: None + * + * | Argument | Description | Type | Valid + * Range | Required | + * |-----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst | The index of the tile in DST register buffer to perform ceil operation | uint32_t | Must be + * less than the size of the DST register buffer | True | + */ +ALWI void ceil_tile_float32(uint32_t idst) { MATH((llk_math_eltwise_unary_sfpu_ceil_float32(idst))); } + } // namespace ckernel diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp index da0ef55c936..ec83abec1cf 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp @@ -82,6 +82,7 @@ enum class UnaryOpType { FLOOR, FLOOR_FLOAT32, CEIL, + CEIL_FLOAT32, LEFT_SHIFT, REMAINDER, FMOD, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp index 0732e967602..4ed08212b53 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp @@ -51,6 +51,8 @@ void update_macro_defines(UnaryOpType op_type, std::map get_op_init_and_func_default(UnaryOpType op_type, std: case UnaryOpType::SIGNBIT: op_init_and_name = {"signbit_tile_init();", fmt::format("signbit_tile({});", idst)}; break; - case UnaryOpType::CEIL: op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile({});", idst)}; break; case UnaryOpType::SIN: op_init_and_name = {"sin_tile_init();", fmt::format("sin_tile({});", idst)}; break; case UnaryOpType::COS: op_init_and_name = {"cos_tile_init();", fmt::format("cos_tile({});", idst)}; break; case UnaryOpType::ISFINITE: @@ -344,7 +344,11 @@ std::pair get_op_init_and_func_default(UnaryOpType op_type, std: op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile({});", idst)}; break; case UnaryOpType::FLOOR_FLOAT32: - op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile_float32({});", idst)}; break; + op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile_float32({});", idst)}; + break; + case UnaryOpType::CEIL: op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile({});", idst)}; break; + case UnaryOpType::CEIL_FLOAT32: + op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile_float32({});", idst)}; break; case UnaryOpType::RELU6: op_init_and_name = {"relu_max_tile_init();", fmt::format("relu_max_tile({}, 0x40c00000u);", idst)}; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp index c87dae81384..ec50c8ce692 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp @@ -99,7 +99,6 @@ template struct ExecuteUnary; template struct ExecuteUnary; template struct ExecuteUnary; template struct ExecuteUnary; -template struct ExecuteUnary; template struct ExecuteUnary; template struct ExecuteUnary; template struct ExecuteUnary; @@ -362,6 +361,32 @@ Tensor Floor::invoke( DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); } +Tensor Ceil::invoke( + uint8_t queue_id, + const Tensor& input_tensor, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + UnaryOpType op_type = UnaryOpType::CEIL; + if (input_tensor.get_dtype() == DataType::FLOAT32) { + op_type = UnaryOpType::CEIL_FLOAT32; + } + + return detail::unary_impl(queue_id, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); +} + +Tensor Ceil::invoke( + const Tensor& input_tensor, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + UnaryOpType op_type = UnaryOpType::CEIL; + if (input_tensor.get_dtype() == DataType::FLOAT32) { + op_type = UnaryOpType::CEIL_FLOAT32; + } + + return detail::unary_impl( + DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor); +} + Tensor Dropout::invoke( const Tensor& input, const uint32_t seed, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp index a5a8d89087d..d778f5a7bdf 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp @@ -161,6 +161,18 @@ struct Floor { const std::optional& optional_output_tensor = std::nullopt); }; +struct Ceil { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); +}; struct Dropout { static Tensor invoke( const Tensor& input, @@ -294,7 +306,6 @@ REGISTER_UNARY_OPERATION(erfinv, ERFINV); REGISTER_UNARY_OPERATION(exp2, EXP2); REGISTER_UNARY_OPERATION(expm1, EXPM1); REGISTER_UNARY_OPERATION(eqz, EQZ); -REGISTER_UNARY_OPERATION(ceil, CEIL); REGISTER_UNARY_OPERATION(gez, GEZ); REGISTER_UNARY_OPERATION(gtz, GTZ); REGISTER_UNARY_OPERATION(i0, I0); @@ -368,6 +379,7 @@ constexpr auto identity = ttnn::register_operation_with_auto_launch_op<"ttnn::identity", ttnn::operations::unary::Identity>(); constexpr auto floor = ttnn::register_operation_with_auto_launch_op<"ttnn::floor", ttnn::operations::unary::Floor>(); +constexpr auto ceil = ttnn::register_operation_with_auto_launch_op<"ttnn::ceil", ttnn::operations::unary::Ceil>(); constexpr auto softplus = ttnn::register_operation_with_auto_launch_op<"ttnn::softplus", ttnn::operations::unary::Softplus>(); constexpr auto prelu_sfpu =