From 1509a6e8c776471b235042b2070a2ee1930c3aaa Mon Sep 17 00:00:00 2001
From: Mouliraj Elamurugan <mcw-melamurugan@ext.tenstorrent.com>
Date: Thu, 12 Dec 2024 16:22:00 +0530
Subject: [PATCH] #15647:Update ceil op (#15657)

### Ticket
Link to GitHub issue #15647

### Problem description

- Ceil op ignores values outside the int16 range

### What's changed

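- Added a dedicated float32 ceil path (`CEIL_FLOAT32` / `ceil_tile_float32`) that converts through `float_to_int32` instead of `float_to_int16`; `ttnn.ceil` selects it when the input dtype is float32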

### Profiling Results : Shape used [1, 1, 102400, 32]

Kernel Duration [ns]
- Bfloat16 : 70841
- Float32 : 153649

### Checklist
- [ ] All Post commit CI
---
 .../operations/eltwise/test_unary.py          | 19 +++++++++++++
 .../llk_api/llk_sfpu/ckernel_sfpu_ceil.h      | 17 +++++++++++-
 .../llk_math_eltwise_unary_sfpu_ceil.h        |  5 ++++
 .../llk_api/llk_sfpu/ckernel_sfpu_ceil.h      | 16 ++++++++++-
 .../llk_math_eltwise_unary_sfpu_ceil.h        |  5 ++++
 .../compute_kernel_api/eltwise_unary/ceil.h   | 18 ++++++++++++-
 .../eltwise/unary/common/unary_op_types.hpp   |  1 +
 .../eltwise/unary/common/unary_op_utils.cpp   | 10 ++++---
 .../ttnn/operations/eltwise/unary/unary.cpp   | 27 ++++++++++++++++++-
 .../ttnn/operations/eltwise/unary/unary.hpp   | 14 +++++++++-
 10 files changed, 124 insertions(+), 8 deletions(-)
diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_unary.py b/tests/ttnn/unit_tests/operations/eltwise/test_unary.py
index da305202e9c..805d73ca5c9 100644
--- a/tests/ttnn/unit_tests/operations/eltwise/test_unary.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_unary.py
@@ -447,3 +447,22 @@ def test_unary_floor(input_shapes, device):
     golden_tensor = golden_function(in_data1)
     output_tensor = ttnn.to_torch(output_tensor)
     assert_with_pcc(golden_tensor, output_tensor, 0.999)
+
+
+@skip_for_grayskull()
+@pytest.mark.parametrize(
+    "input_shapes",
+    (
+        (torch.Size([1, 1, 32, 32])),
+        (torch.Size([1, 1, 320, 384])),
+        (torch.Size([1, 3, 320, 384])),
+    ),
+)
+def test_unary_ceil(input_shapes, device):
+    in_data1 = torch.empty(input_shapes, dtype=torch.float32).uniform_(-43566, 43565)
+    input_tensor1 = ttnn.from_torch(in_data1, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device)
+    output_tensor = ttnn.ceil(input_tensor1)
+    golden_function = ttnn.get_golden_function(ttnn.ceil)
+    golden_tensor = golden_function(in_data1)
+    output_tensor = ttnn.to_torch(output_tensor)
+    assert_with_pcc(golden_tensor, output_tensor, 0.999)
diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
index d803f841dbf..c34da7ec973 100644
--- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
@@ -9,6 +9,7 @@
 #include "sfpi.h"
 #include "noc_nonblocking_api.h"
 #include "limits.h"
+#include "ckernel_sfpu_floor.h"
 
 using namespace sfpi;
 
@@ -20,7 +21,7 @@ inline void calculate_ceil() {
     for (int d = 0; d < ITERATIONS; d++) {
         vFloat result = dst_reg[0];
         vFloat v = result;
-        vInt tmp = float_to_int16(result, 0);  // TODO: Replace float_to_int16 to float_to_int32 once it is available
+        vInt tmp = float_to_int16(result, 0);
         result = int32_to_float(tmp, 0);
         v_if(result < v) { result = result + 1; }
         v_endif;
@@ -31,5 +32,19 @@ inline void calculate_ceil() {
     }
 }
 
+template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
+inline void calculate_ceil_float32() {
+    for (int d = 0; d < ITERATIONS; d++) {
+        vFloat result = dst_reg[0];
+        vFloat v = result;
+        vInt tmp = float_to_int32(result);
+        result = int32_to_float(tmp, 0);
+        v_if(result < v) { result = result + 1; }
+        v_endif;
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+}
+
 }  // namespace sfpu
 }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
index 690528ed0c4..e328d869eb8 100644
--- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
@@ -22,4 +22,9 @@ inline void llk_math_eltwise_unary_sfpu_ceil(uint dst_index, int vector_mode = (
     llk_math_eltwise_unary_sfpu_params<APPROXIMATE>(ckernel::sfpu::calculate_ceil<APPROXIMATE>, dst_index, vector_mode);
 }
 
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_ceil_float32(uint dst_index, int vector_mode = (int)VectorMode::RC) {
+    llk_math_eltwise_unary_sfpu_params<APPROXIMATE>(
+        ckernel::sfpu::calculate_ceil_float32<APPROXIMATE>, dst_index, vector_mode);
+}
 }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
index d803f841dbf..983faf8db6f 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_ceil.h
@@ -9,6 +9,7 @@
 #include "sfpi.h"
 #include "noc_nonblocking_api.h"
 #include "limits.h"
+#include "ckernel_sfpu_floor.h"
 
 using namespace sfpi;
 
@@ -20,7 +21,7 @@ inline void calculate_ceil() {
     for (int d = 0; d < ITERATIONS; d++) {
         vFloat result = dst_reg[0];
         vFloat v = result;
-        vInt tmp = float_to_int16(result, 0);  // TODO: Replace float_to_int16 to float_to_int32 once it is available
+        vInt tmp = float_to_int16(result, 0);
         result = int32_to_float(tmp, 0);
         v_if(result < v) { result = result + 1; }
         v_endif;
@@ -31,5 +32,18 @@ inline void calculate_ceil() {
     }
 }
 
+template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
+inline void calculate_ceil_float32() {
+    for (int d = 0; d < ITERATIONS; d++) {
+        vFloat result = dst_reg[0];
+        vFloat v = result;
+        vInt tmp = float_to_int32(result);
+        result = int32_to_float(tmp, 0);
+        v_if(result < v) { result = result + 1; }
+        v_endif;
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+}
 }  // namespace sfpu
 }  // namespace ckernel
diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
index 690528ed0c4..e328d869eb8 100644
--- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
+++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_ceil.h
@@ -22,4 +22,9 @@ inline void llk_math_eltwise_unary_sfpu_ceil(uint dst_index, int vector_mode = (
     llk_math_eltwise_unary_sfpu_params<APPROXIMATE>(ckernel::sfpu::calculate_ceil<APPROXIMATE>, dst_index, vector_mode);
 }
 
+template <bool APPROXIMATE>
+inline void llk_math_eltwise_unary_sfpu_ceil_float32(uint dst_index, int vector_mode = (int)VectorMode::RC) {
+    llk_math_eltwise_unary_sfpu_params<APPROXIMATE>(
+        ckernel::sfpu::calculate_ceil_float32<APPROXIMATE>, dst_index, vector_mode);
+}
 }  // namespace ckernel
diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h
index bc2f2e7863b..3d8d27724fd 100644
--- a/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h
+++ b/tt_metal/include/compute_kernel_api/eltwise_unary/ceil.h
@@ -31,9 +31,25 @@ ALWI void ceil_tile_init() { MATH((llk_math_eltwise_unary_sfpu_ceil_init<APPROX>
  * | Argument        | Description                                                                | Type     | Valid
  * Range                                           | Required |
  * |-----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------|
- * | idst            | The index of the tile in DST register buffer to modify the sign bit of     | uint32_t | Must be
+ * | idst            | The index of the tile in DST register buffer to perform ceil operation     | uint32_t | Must be
  * less than the size of the DST register buffer | True     |
  */
 ALWI void ceil_tile(uint32_t idst) { MATH((llk_math_eltwise_unary_sfpu_ceil<APPROX>(idst))); }
 
+/**
+ * Performs ceil operation on each row of a tile
+ * in DST register at index idst. The DST register buffer must be in
+ * acquired state via *acquire_dst* call. This call is blocking and is only
+ * available on the compute engine.
+ *
+ * Return value: None
+ *
+ * | Argument        | Description                                                                | Type     | Valid
+ * Range                                           | Required |
+ * |-----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------|
+ * | idst            | The index of the tile in DST register buffer to perform ceil operation     | uint32_t | Must be
+ * less than the size of the DST register buffer | True     |
+ */
+ALWI void ceil_tile_float32(uint32_t idst) { MATH((llk_math_eltwise_unary_sfpu_ceil_float32<APPROX>(idst))); }
+
 }  // namespace ckernel
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp
index da0ef55c936..ec83abec1cf 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_types.hpp
@@ -82,6 +82,7 @@ enum class UnaryOpType {
     FLOOR,
     FLOOR_FLOAT32,
     CEIL,
+    CEIL_FLOAT32,
     LEFT_SHIFT,
     REMAINDER,
     FMOD,
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp
index 0732e967602..4ed08212b53 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp
@@ -51,6 +51,8 @@ void update_macro_defines(UnaryOpType op_type, std::map<std::string, std::string
         case UnaryOpType::IDENTITY_UINT32: defines["SFPU_OP_IDENTITY_INCLUDE"] = "1"; break;
         case UnaryOpType::FLOOR:
         case UnaryOpType::FLOOR_FLOAT32: defines["SFPU_OP_FLOOR_INCLUDE"] = "1"; break;
+        case UnaryOpType::CEIL:
+        case UnaryOpType::CEIL_FLOAT32: defines["SFPU_OP_CEIL_INCLUDE"] = "1"; break;
         case UnaryOpType::RDIV: break;
         case UnaryOpType::RSUB: defines["SFPU_OP_REVERSE_FAMILY_INCLUDE"] = "1";
         case UnaryOpType::ISINF:
@@ -73,7 +75,6 @@ void update_macro_defines(UnaryOpType op_type, std::map<std::string, std::string
         case UnaryOpType::BITWISE_AND: defines["SFPU_OP_BITWISE_AND_INCLUDE"] = "1"; break;
         case UnaryOpType::BITWISE_OR: defines["SFPU_OP_BITWISE_OR_INCLUDE"] = "1"; break;
         case UnaryOpType::RIGHT_SHIFT: defines["SFPU_OP_RIGHT_SHIFT_INCLUDE"] = "1"; break;
-        case UnaryOpType::CEIL: defines["SFPU_OP_CEIL_INCLUDE"] = "1"; break;
         case UnaryOpType::LEFT_SHIFT: defines["SFPU_OP_LEFT_SHIFT_INCLUDE"] = "1"; break;
         case UnaryOpType::REMAINDER: defines["SFPU_OP_REMAINDER_INCLUDE"] = "1"; break;
         case UnaryOpType::FMOD: defines["SFPU_OP_FMOD_INCLUDE"] = "1"; break;
@@ -282,7 +283,6 @@ std::pair<string, string> get_op_init_and_func_default(UnaryOpType op_type, std:
         case UnaryOpType::SIGNBIT:
             op_init_and_name = {"signbit_tile_init();", fmt::format("signbit_tile({});", idst)};
             break;
-        case UnaryOpType::CEIL: op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile({});", idst)}; break;
         case UnaryOpType::SIN: op_init_and_name = {"sin_tile_init();", fmt::format("sin_tile({});", idst)}; break;
         case UnaryOpType::COS: op_init_and_name = {"cos_tile_init();", fmt::format("cos_tile({});", idst)}; break;
         case UnaryOpType::ISFINITE:
@@ -344,7 +344,11 @@ std::pair<string, string> get_op_init_and_func_default(UnaryOpType op_type, std:
             op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile({});", idst)};
             break;
         case UnaryOpType::FLOOR_FLOAT32:
-            op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile_float32({});", idst)}; break;
+            op_init_and_name = {"floor_tile_init();", fmt::format("floor_tile_float32({});", idst)};
+            break;
+        case UnaryOpType::CEIL: op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile({});", idst)}; break;
+        case UnaryOpType::CEIL_FLOAT32:
+            op_init_and_name = {"ceil_tile_init();", fmt::format("ceil_tile_float32({});", idst)};
             break;
         case UnaryOpType::RELU6:
             op_init_and_name = {"relu_max_tile_init();", fmt::format("relu_max_tile({}, 0x40c00000u);", idst)};
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp
index c87dae81384..ec50c8ce692 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.cpp
@@ -99,7 +99,6 @@ template struct ExecuteUnary<UnaryOpType::ERFINV>;
 template struct ExecuteUnary<UnaryOpType::EXP2>;
 template struct ExecuteUnary<UnaryOpType::EXPM1>;
 template struct ExecuteUnary<UnaryOpType::EQZ>;
-template struct ExecuteUnary<UnaryOpType::CEIL>;
 template struct ExecuteUnary<UnaryOpType::GEZ>;
 template struct ExecuteUnary<UnaryOpType::GTZ>;
 template struct ExecuteUnary<UnaryOpType::I0>;
@@ -362,6 +361,32 @@ Tensor Floor::invoke(
         DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor);
 }
 
+Tensor Ceil::invoke(
+    uint8_t queue_id,
+    const Tensor& input_tensor,
+    const std::optional<MemoryConfig>& memory_config,
+    const std::optional<Tensor>& optional_output_tensor) {
+    UnaryOpType op_type = UnaryOpType::CEIL;
+    if (input_tensor.get_dtype() == DataType::FLOAT32) {
+        op_type = UnaryOpType::CEIL_FLOAT32;
+    }
+
+    return detail::unary_impl(queue_id, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor);
+}
+
+Tensor Ceil::invoke(
+    const Tensor& input_tensor,
+    const std::optional<MemoryConfig>& memory_config,
+    const std::optional<Tensor>& optional_output_tensor) {
+    UnaryOpType op_type = UnaryOpType::CEIL;
+    if (input_tensor.get_dtype() == DataType::FLOAT32) {
+        op_type = UnaryOpType::CEIL_FLOAT32;
+    }
+
+    return detail::unary_impl(
+        DefaultQueueId, input_tensor, {UnaryWithParam{op_type}}, memory_config, optional_output_tensor);
+}
+
 Tensor Dropout::invoke(
     const Tensor& input,
     const uint32_t seed,
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp
index a5a8d89087d..d778f5a7bdf 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp
@@ -161,6 +161,18 @@ struct Floor {
         const std::optional<Tensor>& optional_output_tensor = std::nullopt);
 };
 
+struct Ceil {
+    static Tensor invoke(
+        uint8_t queue_id,
+        const Tensor& input_tensor,
+        const std::optional<MemoryConfig>& memory_config = std::nullopt,
+        const std::optional<Tensor>& optional_output_tensor = std::nullopt);
+
+    static Tensor invoke(
+        const Tensor& input_tensor,
+        const std::optional<MemoryConfig>& memory_config = std::nullopt,
+        const std::optional<Tensor>& optional_output_tensor = std::nullopt);
+};
 struct Dropout {
     static Tensor invoke(
         const Tensor& input,
@@ -294,7 +306,6 @@ REGISTER_UNARY_OPERATION(erfinv, ERFINV);
 REGISTER_UNARY_OPERATION(exp2, EXP2);
 REGISTER_UNARY_OPERATION(expm1, EXPM1);
 REGISTER_UNARY_OPERATION(eqz, EQZ);
-REGISTER_UNARY_OPERATION(ceil, CEIL);
 REGISTER_UNARY_OPERATION(gez, GEZ);
 REGISTER_UNARY_OPERATION(gtz, GTZ);
 REGISTER_UNARY_OPERATION(i0, I0);
@@ -368,6 +379,7 @@ constexpr auto identity =
     ttnn::register_operation_with_auto_launch_op<"ttnn::identity", ttnn::operations::unary::Identity>();
 constexpr auto floor =
     ttnn::register_operation_with_auto_launch_op<"ttnn::floor", ttnn::operations::unary::Floor>();
+constexpr auto ceil = ttnn::register_operation_with_auto_launch_op<"ttnn::ceil", ttnn::operations::unary::Ceil>();
 constexpr auto softplus =
     ttnn::register_operation_with_auto_launch_op<"ttnn::softplus", ttnn::operations::unary::Softplus>();
 constexpr auto prelu_sfpu =