#8681: Add ceil op
mouliraj-mcw committed Jul 12, 2024
1 parent dcdaad3 commit e3d0f90
Showing 29 changed files with 187 additions and 54 deletions.
2 changes: 2 additions & 0 deletions docs/source/ttnn/ttnn/dependencies/tt_lib.rst
@@ -458,6 +458,8 @@ Tensor elementwise operations

.. autofunction:: tt_lib.tensor.floor

.. autofunction:: tt_lib.tensor.ceil

.. autofunction:: tt_lib.tensor.trunc

.. autofunction:: tt_lib.tensor.frac
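For reference, the new op mirrors torch.ceil element-wise. The sketch below shows the golden semantics; the commented device-side call is a hypothetical written by analogy with the existing floor binding, since the exact signature is not shown in this diff.

```python
import torch

# Golden behaviour the new op is validated against (see pytorch_ops.ceil below):
x = torch.tensor([-1.7, -0.2, 0.0, 2.3])
print(torch.ceil(x))  # tensor([-1., -0., 0., 3.])

# Hypothetical device-side usage, by analogy with tt_lib.tensor.floor:
# import tt_lib as ttl
# output = ttl.tensor.ceil(tt_input, output_mem_config=mem_config)
```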
6 changes: 5 additions & 1 deletion tests/tt_eager/python_api_testing/sweep_tests/op_map.py
@@ -632,6 +632,10 @@
"tt_op": tt_lib_ops.eltwise_floor,
"pytorch_op": pytorch_ops.floor,
},
"eltwise-ceil": {
"tt_op": tt_lib_ops.eltwise_ceil,
"pytorch_op": pytorch_ops.ceil,
},
"eltwise-trunc": {
"tt_op": tt_lib_ops.eltwise_trunc,
"pytorch_op": pytorch_ops.trunc,
@@ -648,7 +652,7 @@
"tt_op": tt_lib_ops.eltwise_unary_floor_div,
"pytorch_op": pytorch_ops.unary_floor_div,
},
"eltwise-_rfloor_div": {
"eltwise-rfloor_div": {
"tt_op": tt_lib_ops.eltwise_rfloor_div,
"pytorch_op": pytorch_ops.rfloor_div,
},
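Each op_map entry pairs a device op with its PyTorch golden; the sweep runner (run_single_pytorch_test in the tests below) generates inputs, runs both sides, and compares them with comp_pcc. A simplified sketch of that pairing, with the runner internals assumed rather than taken from this commit:

```python
# Hypothetical, simplified view of how an entry such as "eltwise-ceil" is used;
# the real plumbing lives in the sweep framework, not in this commit.
def run_entry(entry, torch_input, tt_input, **tt_kwargs):
    golden = entry["pytorch_op"](torch_input)           # e.g. pytorch_ops.ceil -> torch.ceil
    device_out = entry["tt_op"](tt_input, **tt_kwargs)  # e.g. tt_lib_ops.eltwise_ceil
    return golden, device_out                           # later compared via comp_pcc
```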
@@ -45,9 +45,9 @@ def test_run_div_trunc(
device,
):
datagen_func = [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
] + [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args.update({"output_mem_config": dst_mem_config})
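Widening the random range from ±100 to ±1e6 changes what the bfloat16 inputs look like: with only 7 explicit mantissa bits, bfloat16 cannot store fractional parts beyond a few hundred, so the rounding ops are exercised both where they move values and where they are effectively an identity. That rationale is an inference, not stated in the commit; a plain-torch illustration:

```python
import torch

# bfloat16 keeps 8 exponent bits but only 7 explicit mantissa bits, so the
# representable spacing grows with magnitude and fractions are lost early.
x = torch.tensor([0.7, 100.7, 100000.7], dtype=torch.bfloat16).to(torch.float32)
print(x)              # the largest entry is stored with no fractional part
print(torch.ceil(x))  # ceil only changes the entries that kept a fraction
```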
@@ -585,7 +585,7 @@ def test_run_eltwise_sign_ops(
test_args,
)

@pytest.mark.parametrize("round_off_method", ["floor", "trunc"])
@pytest.mark.parametrize("round_off_method", ["floor", "ceil", "trunc"])
@skip_for_grayskull("#ToDo: GS implementation needs to be done for Floor")
def test_run_eltwise_round_off_ops(
self,
@@ -597,9 +597,7 @@ def test_run_eltwise_round_off_ops(
output_mem_config,
):
datagen_func = [
generation_funcs.gen_func_with_cast(
partial(generation_funcs.gen_rand, low=-1000, high=1000), torch.bfloat16
)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args.update(
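With the parametrization now covering all three rounding modes, the difference only shows up away from exact integers, most visibly on negative inputs:

```python
import torch

x = torch.tensor([-2.5, -0.4, 0.4, 2.5])
print(torch.floor(x))  # tensor([-3., -1., 0., 2.])  rounds toward negative infinity
print(torch.ceil(x))   # tensor([-2., -0., 1., 3.])  rounds toward positive infinity
print(torch.trunc(x))  # tensor([-2., -0., 0., 2.])  rounds toward zero
```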
@@ -26,9 +26,9 @@
@pytest.mark.parametrize(
"input_shapes",
[
[[1, 1, 32, 32], [1, 1, 32, 32]],
[[1, 1, 320, 384], [1, 1, 320, 384]],
[[1, 3, 320, 384], [1, 3, 320, 384]],
[[1, 1, 32, 32]],
[[1, 1, 320, 384]],
[[1, 3, 320, 384]],
],
)
@pytest.mark.parametrize(
@@ -44,7 +44,7 @@ def test_run_frac(
device,
):
datagen_func = [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args.update({"output_mem_config": dst_mem_config})
@@ -4,6 +4,7 @@
import pytest
import torch
import random
import numpy as np
from functools import partial
import tt_lib as ttl
from tests.tt_eager.python_api_testing.sweep_tests import (
@@ -29,10 +30,6 @@
[[1, 3, 320, 384], [1, 3, 320, 384]],
],
)
@pytest.mark.parametrize(
"value",
[-5.1, 0.0, 10.9],
)
@pytest.mark.parametrize(
"dst_mem_config",
mem_configs,
@@ -42,17 +39,17 @@ class TestRfloor_div:
def test_run_rfloor_div(
self,
input_shapes,
value,
dst_mem_config,
device,
):
datagen_func = [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args.update({"value": value})
test_args.update({"value": random.uniform(-100, 100)})
test_args.update({"output_mem_config": dst_mem_config})
comparison_func = comparison_funcs.comp_pcc

run_single_pytorch_test(
"eltwise-rfloor_div",
input_shapes,
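The scalar is now drawn randomly per run instead of being parametrized. For the golden side, rfloor_div is read here as reversed floor division (scalar dividend, tensor divisor); that reading is inferred from the rdiv/rsub naming convention and is not spelled out in this diff:

```python
import torch

def rfloor_div_golden(x: torch.Tensor, value: float) -> torch.Tensor:
    # Assumed semantics: floor-divide the scalar by each tensor element.
    return torch.floor(value / x)
```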
@@ -4,6 +4,7 @@
import pytest
import torch
import random
import numpy as np
from functools import partial
import tt_lib as ttl
from tests.tt_eager.python_api_testing.sweep_tests import (
@@ -29,10 +30,6 @@
[[1, 3, 320, 384], [1, 3, 320, 384]],
],
)
@pytest.mark.parametrize(
"value",
[-5.1, 0.0, 10.9],
)
@pytest.mark.parametrize(
"dst_mem_config",
mem_configs,
@@ -42,17 +39,17 @@ class TestUnary_Div_Trunc:
def test_run_unary_div_trunc(
self,
input_shapes,
value,
dst_mem_config,
device,
):
datagen_func = [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args.update({"value": value})
test_args.update({"value": random.uniform(-100, 100)})
test_args.update({"output_mem_config": dst_mem_config})
comparison_func = comparison_funcs.comp_pcc

run_single_pytorch_test(
"eltwise-unary_div_trunc",
input_shapes,
@@ -4,6 +4,7 @@
import pytest
import torch
import random
import numpy as np
from functools import partial
import tt_lib as ttl
from tests.tt_eager.python_api_testing.sweep_tests import (
@@ -29,10 +30,6 @@
[[1, 3, 320, 384], [1, 3, 320, 384]],
],
)
@pytest.mark.parametrize(
"value",
[-5.1, 0.0, 10.9],
)
@pytest.mark.parametrize(
"dst_mem_config",
mem_configs,
@@ -42,17 +39,17 @@ class TestUnary_Rdiv_Trunc:
def test_run_unary_rdiv_trunc(
self,
input_shapes,
value,
dst_mem_config,
device,
):
datagen_func = [
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16)
generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16)
]
test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
test_args.update({"value": value})
test_args.update({"value": random.uniform(-100, 100)})
test_args.update({"output_mem_config": dst_mem_config})
comparison_func = comparison_funcs.comp_pcc

run_single_pytorch_test(
"eltwise-unary_rdiv_trunc",
input_shapes,
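The same single-random-scalar pattern applies here. Based on the composite implementation later in this diff (div_unary(value, input) followed by trunc), the golden can be sketched as:

```python
import torch

def unary_rdiv_trunc_golden(x: torch.Tensor, value: float) -> torch.Tensor:
    # Divide the scalar by the tensor, then truncate toward zero, mirroring
    # _unary_rdiv_trunc in composite_ops.cpp below.
    return torch.trunc(value / x)
```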
4 changes: 4 additions & 0 deletions tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
@@ -651,6 +651,10 @@ def floor(x, *args, **kwargs):
return torch.floor(x)


def ceil(x, *args, **kwargs):
return torch.ceil(x)


def trunc(x, *args, **kwargs):
return torch.trunc(x)

@@ -2645,6 +2645,7 @@ def unary_op(
transpose_nw = make_unary_op(partial(ttl.tensor.transpose, dim0=0, dim1=-1))
transpose_cw = make_unary_op(partial(ttl.tensor.transpose, dim0=1, dim1=-1))
eltwise_floor = make_unary_op(ttl.tensor.floor)
eltwise_ceil = make_unary_op(ttl.tensor.ceil)
eltwise_trunc = make_unary_op(ttl.tensor.trunc)
eltwise_frac = make_unary_op(ttl.tensor.frac)

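eltwise_ceil reuses the same make_unary_op factory as floor, trunc, and frac. A self-contained analogue of that closure pattern, with the wrapper body assumed (the real helper also handles device placement, dtype, layout, and memory-config plumbing):

```python
import torch

def make_unary_op(op):
    # Close over a single-tensor op and return a wrapper with a uniform
    # calling convention; extra arguments are accepted and ignored here.
    def unary_op(x, *args, **kwargs):
        return op(x)
    return unary_op

# With torch.ceil standing in for ttl.tensor.ceil:
eltwise_ceil = make_unary_op(torch.ceil)
print(eltwise_ceil(torch.tensor([1.2, -3.7])))  # tensor([ 2., -3.])
```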
10 changes: 9 additions & 1 deletion tt_eager/tt_dnn/op_library/composite/composite_ops.cpp
@@ -995,8 +995,10 @@ Tensor trunc(const Tensor& input, const MemoryConfig& output_mem_config) {
}

Tensor _frac(const Tensor& input, const MemoryConfig& output_mem_config) {
auto arch = input.device()->arch();
TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
Tensor trunc_res = trunc(input, output_mem_config);
Tensor result = sub(input, trunc_res, std::nullopt, output_mem_config);
Tensor result = ttnn::subtract(input, trunc_res, std::nullopt, output_mem_config);
return result;
}
Tensor frac(const Tensor& input, const MemoryConfig& output_mem_config) {
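Numerically, the reworked _frac realises the identity frac(x) = x - trunc(x) out of existing device ops (trunc, then ttnn::subtract). A plain-torch check of the identity, which mirrors but does not call the device composite:

```python
import torch

x = torch.tensor([-2.6, -0.3, 0.0, 1.9, 5.5])
frac_composed = x - torch.trunc(x)  # same composition as _frac above
print(frac_composed)                # approximately tensor([-0.6, -0.3, 0.0, 0.9, 0.5])
print(torch.frac(x))                # matches the composed result up to float rounding
```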
@@ -1007,6 +1009,8 @@ Tensor _div_trunc(
const Tensor& input_a,
const Tensor& input_b,
const MemoryConfig& output_mem_config) {
auto arch = input_a.device()->arch();
TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
Tensor result = div(input_a, input_b, true);
return trunc(result);
}
@@ -1021,6 +1025,8 @@ Tensor _div_trunc_overload(
const Tensor& input,
float value,
const MemoryConfig& output_mem_config) {
auto arch = input.device()->arch();
TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
Tensor result = div_unary(input, value);
return trunc(result);
}
@@ -1035,6 +1041,8 @@ Tensor _unary_rdiv_trunc(
float value,
const Tensor& input,
const MemoryConfig& output_mem_config) {
auto arch = input.device()->arch();
TT_FATAL(arch == tt::ARCH::WORMHOLE_B0, "Op is only supported on Wormhole");
Tensor result = div_unary(value, input);
return trunc(result);
}
4 changes: 1 addition & 3 deletions tt_eager/tt_dnn/op_library/composite/composite_ops.hpp
@@ -190,13 +190,11 @@ Tensor fmod(
const Tensor& input_b,
const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG);

<<<<<<< HEAD
Tensor trunc(const Tensor& input, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG);
=======

Tensor frac(
const Tensor& input,
const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG);
>>>>>>> #8681: Add frac op

Tensor round(
const Tensor& input,
18 changes: 9 additions & 9 deletions tt_eager/tt_dnn/op_library/conv/conv_op.cpp
@@ -34,10 +34,10 @@ pair<vector<uint32_t>, vector<uint32_t>> compute_conv_activation_as_mm_shape(Sha
// pad height
uint32_t num_rows = (uint32_t) conv_output_h*conv_output_w;
uint32_t act_block_h_datums = act_block_h_ntiles * TILE_HEIGHT;
uint32_t num_rows_padded = (uint32_t) (ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums);
uint32_t num_rows_padded = (uint32_t) (std::ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums);
uint32_t num_cols = conv_activation_shape[3] * filter_h * filter_w;
uint32_t act_block_w_datums = act_block_w_ntiles * TILE_WIDTH;
uint32_t num_cols_padded = (uint32_t) (ceil((double) num_cols / (double) act_block_w_datums ) * act_block_w_datums);
uint32_t num_cols_padded = (uint32_t) (std::ceil((double) num_cols / (double) act_block_w_datums ) * act_block_w_datums);
if(use_fast_reader) {
assert(act_block_w_datums >= conv_activation_shape[3] * filter_w);
num_cols_padded = act_block_w_datums * filter_h;
@@ -218,7 +218,7 @@ operation::ProgramWithCallbacks conv_as_large_bmm_single_core_(const Tensor& a,
uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH);
assert(output_channels_padded_to_tile_width <= weight_matrix_width);
uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH;
uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums);
assert(last_block_width_datums % TILE_WIDTH == 0);
uint32_t output_row_size_bytes = output_channels_padded_to_tile_width * num_bytes_of_df;
@@ -726,7 +726,7 @@ std::pair<vector<uint32_t>, vector<uint32_t>> generate_conv_weight_address_map(
address_map_metadata.push_back(address_map_current_group_dram_address_offset);
address_map_metadata.push_back(address_map_current_group_size);
// Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements)
uint32_t address_map_current_group_size_padded = (uint32_t) (ceil((double) address_map_current_group_size / (double) 8) * 8);
uint32_t address_map_current_group_size_padded = (uint32_t) (std::ceil((double) address_map_current_group_size / (double) 8) * 8);
if(address_map_current_group_size_padded != address_map_current_group_size) {
assert(address_map_current_group_size_padded > address_map_current_group_size);
address_map.insert(address_map.end(), address_map_current_group_size_padded - address_map_current_group_size, 0);
@@ -764,8 +764,8 @@ std::pair<vector<uint32_t>, vector<uint32_t>> generate_conv_activation_address_m
int conv_output_w = ((conv_input_y - S + (2 * Pad_W)) / V) + 1;
uint32_t matrix_height_unpadded = conv_output_h * conv_output_w;
uint32_t matrix_width_unpadded = conv_input_z * R * S;
uint32_t matrix_height = (uint32_t) (ceil((double) matrix_height_unpadded / (double) act_block_h_datums ) * act_block_h_datums);
uint32_t matrix_width = (uint32_t) (ceil((double) matrix_width_unpadded / (double) act_block_w_datums ) * act_block_w_datums);
uint32_t matrix_height = (uint32_t) (std::ceil((double) matrix_height_unpadded / (double) act_block_h_datums ) * act_block_h_datums);
uint32_t matrix_width = (uint32_t) (std::ceil((double) matrix_width_unpadded / (double) act_block_w_datums ) * act_block_w_datums);

uint32_t num_groups = num_blocks_act_h * num_blocks_act_w * num_blocks_weight_w;
uint32_t channel_stick_size = conv_input_z;
@@ -854,7 +854,7 @@ std::pair<vector<uint32_t>, vector<uint32_t>> generate_conv_activation_address_m
address_map_metadata.push_back(address_map_current_group_dram_address_offset);
address_map_metadata.push_back(address_map_current_group_size);
// Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements)
uint32_t address_map_current_group_size_padded = (uint32_t) (ceil((double) address_map_current_group_size / (double) 8) * 8);
uint32_t address_map_current_group_size_padded = (uint32_t) (std::ceil((double) address_map_current_group_size / (double) 8) * 8);
if(address_map_current_group_size_padded != address_map_current_group_size) {
assert(address_map_current_group_size_padded > address_map_current_group_size);
address_map.insert(address_map.end(), address_map_current_group_size_padded - address_map_current_group_size, 0);
@@ -903,7 +903,7 @@ std::pair<vector<uint32_t>, vector<uint32_t>> populate_address_map_vectors_for_r
address_map_raw_current_group_start + current_group_size);
address_map_raw_index += current_group_size;
// Pad 0s in address map buffer to ensure each read address is 32B aligned (32/sizeof(uint32_t) == 8 elements)
uint32_t current_group_size_padded = (uint32_t) (ceil((double) current_group_size / (double) 8) * 8);
uint32_t current_group_size_padded = (uint32_t) (std::ceil((double) current_group_size / (double) 8) * 8);
if(current_group_size_padded != current_group_size) {
assert(current_group_size_padded > current_group_size);
address_map.insert(address_map.end(), current_group_size_padded - current_group_size, 0);
@@ -988,7 +988,7 @@ operation::ProgramWithCallbacks conv_as_large_bmm_with_address_map_single_core_(
// it removes the padding done for block width but it doesn't remove padding done for tiled width
uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH);
assert(output_channels_padded_to_tile_width <= Wb);
uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums);
assert(last_block_width_datums % TILE_WIDTH == 0);
uint32_t output_row_size_bytes = output_channels_padded_to_tile_width * num_bytes_of_df;
@@ -262,7 +262,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_(const Tensor& a, cons
uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH);
assert(output_channels_padded_to_tile_width <= weight_matrix_width);
uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH;
uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums);
assert(last_block_width_datums % TILE_WIDTH == 0);

@@ -278,7 +278,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_(const Tensor&
uint32_t output_channels_padded_to_tile_width = round_up(output_channels, TILE_WIDTH);
assert(output_channels_padded_to_tile_width <= weight_matrix_width);
uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH;
uint32_t num_blocks_output_w = (uint32_t) ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t num_blocks_output_w = (uint32_t) std::ceil((double) output_channels_padded_to_tile_width / (double) weight_block_w_datums);
uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0) ? weight_block_w_datums : (output_channels_padded_to_tile_width % weight_block_w_datums);
assert(last_block_width_datums % TILE_WIDTH == 0);

@@ -442,7 +442,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
assert(output_channels_padded_to_tile_width <= weight_matrix_width);
uint32_t output_width_num_tiles = output_channels_padded_to_tile_width / TILE_WIDTH;
uint32_t num_blocks_output_w =
(uint32_t)ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums);
(uint32_t)std::ceil((double)output_channels_padded_to_tile_width / (double)weight_block_w_datums);
uint32_t last_block_width_datums = (output_channels_padded_to_tile_width % weight_block_w_datums == 0)
? weight_block_w_datums
: (output_channels_padded_to_tile_width % weight_block_w_datums);
2 changes: 1 addition & 1 deletion tt_eager/tt_dnn/op_library/conv/optimized_conv_op.cpp
@@ -40,7 +40,7 @@ pair<vector<uint32_t>, vector<uint32_t>> compute_opt_conv_activation_as_mm_shape
// pad height
uint32_t num_rows = (uint32_t) batch_size * conv_output_h * conv_output_w;
uint32_t act_block_h_datums = act_block_h_ntiles * TILE_HEIGHT;
uint32_t num_rows_padded = (uint32_t) (ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums);
uint32_t num_rows_padded = (uint32_t) (std::ceil((double) num_rows / (double) act_block_h_datums ) * act_block_h_datums);
uint32_t num_cols = conv_activation_shape[3] * filter_h * filter_w;
uint32_t num_cols_padded = round_up(conv_activation_shape[3] * filter_w, TILE_WIDTH) * filter_h;
return {{1, num_rows_padded, num_cols_padded}, {1, num_rows, num_cols}};