From 63ffc11493c5196971d795f56b0baa19b347edcd Mon Sep 17 00:00:00 2001 From: Naif Tarafdar <135640067+tarafdarTT@users.noreply.github.com> Date: Sat, 3 Aug 2024 12:31:15 -0400 Subject: [PATCH] Split Data movement Ops Program factory into CPP and HPP (#11067) #0: split data movement ops into hpp and cpp --- ttnn/CMakeLists.txt | 5 + .../downsample/device/downsample_op.cpp | 862 +------------ .../device/downsample_program_factory.cpp | 877 +++++++++++++ .../device/downsample_program_factory.hpp | 18 + .../pad/device/pad_program_factory.cpp | 1127 +++++++++++++++++ .../pad/device/pad_program_factory.hpp | 1095 +--------------- .../data_movement/slice/device/slice_op.cpp | 2 +- .../slice/device/slice_program_factory.cpp | 545 ++++++++ .../slice/device/slice_program_factory.hpp | 541 +------- .../tilize/device/tilize_program_factory.cpp | 411 ++++++ .../tilize/device/tilize_program_factory.hpp | 400 +----- ...ilize_with_val_padding_program_factory.cpp | 491 +++++++ ...ilize_with_val_padding_program_factory.hpp | 475 +------ 13 files changed, 3494 insertions(+), 3355 deletions(-) create mode 100644 ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.cpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.hpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 6886f1b8a61..3de0178fac3 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -63,6 +63,7 @@ set(TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/embedding/device/embedding_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/concat/concat.cpp @@ -71,11 +72,15 @@ set(TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/softmax/device/softmax_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/device/pad_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/downsample/device/downsample_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/device/transformer_device_operation.cpp diff --git a/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_op.cpp index 1582000ab46..50b22c15332 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_op.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "downsample_op.hpp" +#include "downsample_program_factory.hpp" #include @@ -15,11 +16,7 @@ using namespace tt::constants; -namespace ttnn { - -namespace operations { - -namespace data_movement { +namespace ttnn::operations::data_movement { void Downsample::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); @@ -34,25 +31,7 @@ void Downsample::validate(const std::vector& input_tensors) const { input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED); } -std::pair get_num_cores_height_width_sliced( - CoreRangeSet all_cores, TensorMemoryLayout memory_layout, ShardOrientation shard_orientation) { - TT_ASSERT( - memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || memory_layout == TensorMemoryLayout::BLOCK_SHARDED); - if (memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) { - TT_ASSERT(shard_orientation == ShardOrientation::ROW_MAJOR); - } else { - TT_ASSERT(shard_orientation == ShardOrientation::COL_MAJOR); - TT_ASSERT(all_cores.ranges().size() == 1); - } - uint32_t num_cores = all_cores.num_cores(); - auto first_core_range = *all_cores.ranges().begin(); - uint32_t num_cores_height_sliced = - memory_layout == TensorMemoryLayout::HEIGHT_SHARDED ? num_cores : first_core_range.end_coord.x + 1; - uint32_t num_cores_width_sliced = memory_layout == TensorMemoryLayout::HEIGHT_SHARDED - ? 
1 - : first_core_range.end_coord.y + 1; // width is not sliced when height sharded - return {num_cores_height_sliced, num_cores_width_sliced}; -} + std::vector Downsample::compute_output_shapes(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); @@ -74,7 +53,7 @@ std::vector Downsample::compute_output_shapes(const std::ve std::vector Downsample::create_output_tensors(const std::vector& input_tensors) const { const auto& input_tensor = input_tensors.at(0); auto output_shape = this->compute_output_shapes(input_tensors).at(0); - auto [num_cores_height_sliced, num_cores_width_sliced] = get_num_cores_height_width_sliced( + auto [num_cores_height_sliced, num_cores_width_sliced] = detail::get_num_cores_height_width_sliced( input_tensor.shard_spec().value().grid, input_tensor.memory_config().memory_layout, input_tensor.shard_spec().value().orientation); @@ -94,7 +73,7 @@ operation::ProgramWithCallbacks Downsample::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor_a = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); - return {downsample_single_core(input_tensor_a, downsample_params, output_tensor)}; + return {detail::downsample_single_core(input_tensor_a, downsample_params, output_tensor)}; } Tensor downsample( @@ -104,840 +83,11 @@ Tensor downsample( return output_tensors.at(0); } -struct DownsampleReadPatternParams { - uint32_t top_partial_middle_aligned_row_width; - uint32_t skip_top_partial_middle_aligned_row; - uint32_t top_partial_right_aligned_row_width; - uint32_t skip_top_partial_right_aligned_row; - uint32_t num_rows_top_partial_image; - uint32_t num_skip_rows_top_partial_image; - uint32_t num_full_images; - uint32_t num_rows_bottom_partial_image; - uint32_t num_skip_rows_bottom_partial_image; - uint32_t bottom_partial_left_aligned_row_width; - uint32_t skip_bottom_partial_left_aligned_row; -}; - -struct ImgTrackingVars { - uint32_t img_h = 0; - uint32_t img_w = 0; - uint32_t next_img_h = 0; // img_h after stride - uint32_t next_img_w = 0; - uint32_t input_flat_h = 0; // index within sharded input - uint32_t output_flat_h = 0; // index within sharded output -}; - -DownsampleReadPatternParams generate_downsample_read_pattern( - ImgTrackingVars& v, - uint32_t img_height, - uint32_t img_width, - uint32_t img_stride_h, - uint32_t img_stride_w, - uint32_t input_end_flat_h, - uint32_t output_end_flat_h, - bool current_region_is_halo_prev_core, - bool current_region_is_halo_next_core) { - // Sanity checks at the start for local data - TT_ASSERT(v.next_img_h >= v.img_h); - TT_ASSERT(v.next_img_w == v.img_w); // assumption that the start is picked and not skipped by stride - TT_ASSERT(v.img_h < img_height); - TT_ASSERT(v.next_img_w < img_width); - if (current_region_is_halo_prev_core) { - // cout << "GENERATING READ PATTERN FOR HALO REGION FROM PREVIOUS CORE" << endl; - TT_ASSERT(!current_region_is_halo_next_core); - TT_ASSERT(v.input_flat_h != 0); - TT_ASSERT(v.output_flat_h == 0); - } else if (current_region_is_halo_next_core) { - // cout << "GENERATING READ PATTERN FOR HALO REGION FROM NEXT CORE" << endl; - TT_ASSERT(!current_region_is_halo_prev_core); - TT_ASSERT(v.input_flat_h == 0); - TT_ASSERT(v.output_flat_h != 0); - } else { - // cout << "GENERATING READ PATTERN FOR LOCAL REGION" << endl; - } - - // cout << "img_h=" << v.img_h << ", img_w=" << v.img_w << ", next_img_h=" << v.next_img_h << ", next_img_w=" << - // v.img_w << endl; cout << "v.input_flat_h=" << v.input_flat_h 
<< ", input_end_flat_h=" << input_end_flat_h << ", - // v.output_flat_h=" << v.output_flat_h << ", output_end_flat_h=" << output_end_flat_h << endl; - - TT_ASSERT(v.input_flat_h < input_end_flat_h); - TT_ASSERT(v.output_flat_h < output_end_flat_h); - - uint32_t output_img_height = std::ceil((double)img_height / (double)img_stride_h); - uint32_t output_img_width = std::ceil((double)img_width / (double)img_stride_w); - bool found_halo_for_next_core = false; - - uint32_t top_partial_middle_aligned_row_width = 0; - uint32_t skip_top_partial_middle_aligned_row = 1; - uint32_t top_partial_right_aligned_row_width = 0; - uint32_t skip_top_partial_right_aligned_row = 1; - uint32_t num_rows_top_partial_image = 0; - uint32_t num_skip_rows_top_partial_image = 0; - uint32_t num_full_images = 0; - uint32_t num_rows_bottom_partial_image = 0; - uint32_t num_skip_rows_bottom_partial_image = 0; - uint32_t bottom_partial_left_aligned_row_width = 0; - uint32_t skip_bottom_partial_left_aligned_row = 1; - if (v.img_w != 0) { - // Check if its right aligned or middle aligned (special corner case for halo) - if (v.input_flat_h + img_width - v.img_w <= input_end_flat_h + 1) { - // top partial right aligned - top_partial_right_aligned_row_width = img_width - v.img_w; - skip_top_partial_right_aligned_row = (v.next_img_h == v.img_h) ? 0 : 1; - v.input_flat_h += top_partial_right_aligned_row_width; - if (!skip_top_partial_right_aligned_row) { - v.output_flat_h += std::ceil((double)top_partial_right_aligned_row_width / (double)img_stride_w); - TT_ASSERT(v.output_flat_h <= output_end_flat_h); - } - v.img_w = 0; - v.next_img_w = 0; - if (v.img_h == img_height - 1) { - v.img_h = 0; - v.next_img_h = 0; - } else { - v.img_h += 1; - if (v.next_img_h < v.img_h) { - v.next_img_h += img_stride_h; - } - } - } else { - // special corner case for halo region - // middle aligned - TT_ASSERT(input_end_flat_h - v.input_flat_h + 1 < img_width); - TT_ASSERT(current_region_is_halo_prev_core || current_region_is_halo_next_core); - // top partial middle aligned - top_partial_middle_aligned_row_width = input_end_flat_h - v.input_flat_h + 1; - skip_top_partial_middle_aligned_row = (v.next_img_h == v.img_h) ? 0 : 1; - v.input_flat_h += top_partial_middle_aligned_row_width; - if (!skip_top_partial_middle_aligned_row) { - v.output_flat_h += std::ceil((double)top_partial_middle_aligned_row_width / (double)img_stride_w); - TT_ASSERT(v.output_flat_h <= output_end_flat_h); - } - uint32_t img_w_start = v.img_w; - while (v.img_w < img_w_start + top_partial_middle_aligned_row_width) { - v.img_w += 1; - if (v.next_img_w < v.img_w) { - v.next_img_w += img_stride_w; - } - } - TT_ASSERT(v.img_w < img_width - 1); - } - } - TT_ASSERT(v.next_img_w == v.img_w); - TT_ASSERT(v.output_flat_h <= output_end_flat_h); - TT_ASSERT(v.next_img_h >= v.img_h); - if (v.img_w != 0) { - // special case for halo - TT_ASSERT(v.input_flat_h == input_end_flat_h + 1); - } - TT_ASSERT(v.img_h < img_height && v.img_w < img_width); - - uint32_t num_rows_remaining_of_current_image = (v.img_h == 0) ? 
0 : img_height - v.img_h; - if (num_rows_remaining_of_current_image > 0) { - uint32_t num_rows_to_skip = v.next_img_h - v.img_h; - uint32_t output_h_from_remaining_rows_of_current_image = - std::ceil((double)(num_rows_remaining_of_current_image - num_rows_to_skip) / (double)img_stride_h) * - output_img_width; - bool output_for_partial_top_image = - v.output_flat_h + output_h_from_remaining_rows_of_current_image <= output_end_flat_h + 1; - bool input_for_partial_top_image = - v.input_flat_h + (num_rows_remaining_of_current_image * img_width) <= input_end_flat_h + 1; - if (output_for_partial_top_image && input_for_partial_top_image) { - // Top partial image section - num_rows_top_partial_image = img_height - v.img_h; - num_skip_rows_top_partial_image = v.next_img_h - v.img_h; - // Sanity check - TT_ASSERT((v.img_h + num_rows_top_partial_image == img_height)); - v.img_h = 0; - v.next_img_h = 0; - v.input_flat_h += (num_rows_top_partial_image * img_width); - v.output_flat_h += output_h_from_remaining_rows_of_current_image; - TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); - } - TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); - } - TT_ASSERT(v.img_h < img_height && v.img_w < img_width); - - if (v.img_h == 0 && v.img_w == 0) { - // Check for full images - while (1) { - bool output_for_current_full_image = - v.output_flat_h + (output_img_height * output_img_width) <= output_end_flat_h + 1; - bool input_for_current_full_image = v.input_flat_h + (img_height * img_width) <= input_end_flat_h + 1; - if (!output_for_current_full_image || !input_for_current_full_image) { - break; - } - v.input_flat_h += (img_height * img_width); - v.img_h = 0; - v.img_w = 0; - v.next_img_h = 0; - v.next_img_w = 0; - num_full_images += 1; - v.output_flat_h += (output_img_height * output_img_width); - } - TT_ASSERT(v.img_h == 0 && v.img_w == 0 && v.next_img_h == 0 && v.next_img_w == 0); - } - - // Sanity check - TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); - TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); - - bool found_first_unskipped_row_in_bottom_partial_imgage = false; - // check for bottom partial image rows - while (1) { - bool output_for_bottom_partial_image_row = (v.next_img_h == v.img_h) - ? 
(v.output_flat_h + output_img_width <= output_end_flat_h + 1) - : true; // true for skipped row - bool input_for_bottom_partial_image_row = v.input_flat_h + img_width <= input_end_flat_h + 1; - if (!output_for_bottom_partial_image_row || !input_for_bottom_partial_image_row) { - break; - } - if (!found_first_unskipped_row_in_bottom_partial_imgage) { - if (v.next_img_h == v.img_h) { - found_first_unskipped_row_in_bottom_partial_imgage = true; - } else { - TT_ASSERT(v.next_img_h > v.img_h); - num_skip_rows_bottom_partial_image += 1; - } - } - v.input_flat_h += img_width; - if (v.next_img_h == v.img_h) { - v.output_flat_h += output_img_width; - } - v.img_w = 0; - v.next_img_w = 0; - TT_ASSERT(v.img_h < img_height - 1); // this is supposed to be a bottom partial image - v.img_h += 1; - if (v.next_img_h < v.img_h) { - v.next_img_h += img_stride_h; - TT_ASSERT(v.next_img_h <= img_height); // odd heights and odd size sharding with stride > 1 not supported - if (v.next_img_h == img_height && v.img_h == img_height) { - v.next_img_h = 0; - v.img_h = 0; - break; - } - } - num_rows_bottom_partial_image += 1; - } - - // Sanity check - TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); - TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); - TT_ASSERT(v.img_h < img_height && v.img_w < img_width); - - // check if there is a bottom partial left aligned row - if (v.input_flat_h <= input_end_flat_h && v.output_flat_h <= output_end_flat_h) { - TT_ASSERT(v.img_w == 0 && v.next_img_w == 0); - // bottom partial left aligned row width can be split between 2 cores - uint32_t input_remaining = input_end_flat_h - v.input_flat_h + 1; - uint32_t output_remaining = output_end_flat_h - v.output_flat_h + 1; - TT_ASSERT( - output_remaining < output_img_width || - input_remaining < img_width); // there must be a partial width either on input side or output side - bottom_partial_left_aligned_row_width = input_remaining; - if (output_remaining < output_img_width) { - bottom_partial_left_aligned_row_width = std::min(input_remaining, output_remaining * img_stride_w); - } - // sanity - TT_ASSERT(bottom_partial_left_aligned_row_width < img_width); - TT_ASSERT(v.next_img_h >= v.img_h); - skip_bottom_partial_left_aligned_row = (v.next_img_h == v.img_h) ? 
0 : 1; - while (v.img_w < bottom_partial_left_aligned_row_width) { - v.img_w += 1; - if (v.next_img_w < v.img_w) { - v.next_img_w += img_stride_w; - TT_ASSERT(v.next_img_w < img_width); // odd widths and odd size sharding with stride > 1 not supported - } - } - TT_ASSERT(v.img_w == bottom_partial_left_aligned_row_width && v.next_img_w >= v.img_w); - v.input_flat_h += bottom_partial_left_aligned_row_width; - if (!skip_bottom_partial_left_aligned_row) { - v.output_flat_h += std::ceil((double)bottom_partial_left_aligned_row_width / (double)img_stride_w); - } - } - TT_ASSERT(v.img_h < img_height && v.img_w < img_width); - - if (0) { - log_debug(tt::LogOp, " top_partial_middle_aligned_row_width: {}", top_partial_middle_aligned_row_width); - log_debug(tt::LogOp, " skip_top_partial_middle_aligned_row: {}", skip_top_partial_middle_aligned_row); - log_debug(tt::LogOp, " top_partial_right_aligned_row_width: {}", top_partial_right_aligned_row_width); - log_debug(tt::LogOp, " skip_top_partial_right_aligned_row: {}", skip_top_partial_right_aligned_row); - log_debug(tt::LogOp, " num_rows_top_partial_image: {}", num_rows_top_partial_image); - log_debug(tt::LogOp, " num_skip_rows_top_partial_image: {}", num_skip_rows_top_partial_image); - log_debug(tt::LogOp, " num_full_images: {}", num_full_images); - log_debug(tt::LogOp, " num_rows_bottom_partial_image: {}", num_rows_bottom_partial_image); - log_debug(tt::LogOp, " num_skip_rows_bottom_partial_image: {}", num_skip_rows_bottom_partial_image); - log_debug(tt::LogOp, " bottom_partial_left_aligned_row_width: {}", bottom_partial_left_aligned_row_width); - log_debug(tt::LogOp, " skip_bottom_partial_left_aligned_row: {}", skip_bottom_partial_left_aligned_row); - log_debug(tt::LogOp, " v.input_flat_h: {}", v.input_flat_h); - log_debug(tt::LogOp, " v.output_flat_h: {}", v.output_flat_h); - log_debug(tt::LogOp, " input_end_flat_h: {}", input_end_flat_h); - log_debug(tt::LogOp, " output_end_flat_h: {}", output_end_flat_h); - // cout << "img_h=" << v.img_h << ", img_w=" << v.img_w << ", next_img_h=" << v.next_img_h << ", next_img_w=" << - // v.img_w << endl; - } - - // Sanity check - TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); - TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); - - if (v.input_flat_h == input_end_flat_h + 1) { - v.input_flat_h = 0; - } - if (v.output_flat_h == output_end_flat_h + 1) { - v.output_flat_h = 0; - } - return DownsampleReadPatternParams{ - .top_partial_middle_aligned_row_width = top_partial_middle_aligned_row_width, - .skip_top_partial_middle_aligned_row = skip_top_partial_middle_aligned_row, - .top_partial_right_aligned_row_width = top_partial_right_aligned_row_width, - .skip_top_partial_right_aligned_row = skip_top_partial_right_aligned_row, - .num_rows_top_partial_image = num_rows_top_partial_image, - .num_skip_rows_top_partial_image = num_skip_rows_top_partial_image, - .num_full_images = num_full_images, - .num_rows_bottom_partial_image = num_rows_bottom_partial_image, - .num_skip_rows_bottom_partial_image = num_skip_rows_bottom_partial_image, - .bottom_partial_left_aligned_row_width = bottom_partial_left_aligned_row_width, - .skip_bottom_partial_left_aligned_row = skip_bottom_partial_left_aligned_row}; -} - -operation::ProgramWithCallbacks downsample_single_core( - const Tensor& a, std::array downsample_params, Tensor& output) { - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t 
input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - tt::DataFormat untilized_cb_data_format = tt::DataFormat::Float16_b; - uint32_t untilized_single_tile_size = tt::tt_metal::detail::TileSize(untilized_cb_data_format); - auto [img_batch_size, img_height, img_width, img_stride_h, img_stride_w] = downsample_params; - tt::tt_metal::Buffer* src0_buffer = a.buffer(); - - TT_ASSERT(a.get_legacy_shape()[0] == 1 && a.get_legacy_shape()[1] == 1); - TT_ASSERT(output.get_legacy_shape()[0] == 1 && output.get_legacy_shape()[1] == 1); - - tt::tt_metal::Device* device = a.device(); - - tt::tt_metal::Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - // Sanity check of output size - TT_ASSERT(output.volume() % TILE_HW == 0); - uint32_t unpadded_input_volume = img_batch_size * img_height * img_width; - TT_ASSERT(a.volume() >= unpadded_input_volume); - uint32_t unpadded_output_volume = ceil((double)unpadded_input_volume / (double)(img_stride_h * img_stride_w)); - TT_ASSERT(output.volume() >= unpadded_output_volume); - - uint32_t ncores_x_full_grid = device->compute_with_storage_grid_size().x; - auto [num_cores_height_sliced, num_cores_width_sliced] = get_num_cores_height_width_sliced( - a.shard_spec().value().grid, a.memory_config().memory_layout, a.shard_spec().value().orientation); - uint32_t num_cores = num_cores_height_sliced * num_cores_width_sliced; - auto all_cores = a.shard_spec().value().grid; - auto memory_layout = a.memory_config().memory_layout; - TT_ASSERT(all_cores == output.shard_spec().value().grid); - TT_ASSERT(memory_layout == output.memory_config().memory_layout); - TT_ASSERT( - memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || memory_layout == TensorMemoryLayout::BLOCK_SHARDED); - if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { - TT_ASSERT(all_cores.ranges().size() == 1); - } else { - TT_ASSERT(num_cores_width_sliced == 1); - } - uint32_t num_cores_x = - memory_layout == TensorMemoryLayout::HEIGHT_SHARDED ? 
ncores_x_full_grid : num_cores_height_sliced; - - auto core_range = all_cores; - - uint32_t input_height = - a.get_legacy_shape()[2]; // input height == flattened face of input image, multiple images are stacked in H dim - uint32_t input_width = a.get_legacy_shape()[3]; // input width == input image # of channels - uint32_t output_height = output.get_legacy_shape()[2]; // output height == flattened face of output image, multiple - // images are stacked in H dim - uint32_t output_width = output.get_legacy_shape()[3]; - TT_ASSERT(input_width == output_width); - - uint32_t input_height_unpadded = img_batch_size * img_height * img_width; - TT_ASSERT(input_height >= input_height_unpadded); - uint32_t output_height_unpadded = - img_batch_size * std::ceil((double)(img_height * img_width) / (double)(img_stride_h * img_stride_w)); - TT_ASSERT(output_height >= output_height_unpadded); - - uint32_t input_shard_height = a.shard_spec().value().shape[0]; - TT_ASSERT(input_shard_height * num_cores_height_sliced >= input_height); // last core shard size may be padded - uint32_t input_height_padded = input_shard_height * num_cores_height_sliced; - uint32_t input_shard_width = a.shard_spec().value().shape[1]; - TT_ASSERT(input_shard_width * num_cores_width_sliced == input_width); - - uint32_t input_height_padding = input_height_padded - input_height_unpadded; - TT_ASSERT(input_height_padding < input_shard_height); // last core has padding - uint32_t last_core_input_shard_height_unpadded = input_shard_height - input_height_padding; - - uint32_t output_shard_height = output.shard_spec().value().shape[0]; - uint32_t output_shard_width = output.shard_spec().value().shape[1]; - TT_ASSERT(output_shard_width == input_shard_width); - TT_ASSERT(output_shard_height * num_cores_height_sliced >= output_height); // last core shard size may be padded - uint32_t output_height_padded = output_shard_height * num_cores_height_sliced; - uint32_t output_height_padding = output_height_padded - output_height_unpadded; - TT_ASSERT(output_height_padding < output_shard_height); - uint32_t last_core_output_shard_height_unpadded = output_shard_height - output_height_padding; - uint32_t input_shard_width_bytes = input_shard_width * datum_size(untilized_cb_data_format); - TT_ASSERT(input_shard_width % TILE_WIDTH == 0); - uint32_t num_input_tiles_in_row = input_shard_width / TILE_WIDTH; - TT_ASSERT(input_shard_height % TILE_HEIGHT == 0); - uint32_t num_rows_of_input_tiles = input_shard_height / TILE_HEIGHT; - uint32_t num_output_tiles_in_row = num_input_tiles_in_row; - TT_ASSERT(output_shard_height % TILE_HEIGHT == 0); - uint32_t num_rows_of_output_tiles = output_shard_height / TILE_HEIGHT; - uint32_t input_cb_index = tt::CB::c_in0; - uint32_t num_input_tiles = num_input_tiles_in_row * num_rows_of_input_tiles; - tt::tt_metal::CircularBufferConfig input_cb_config = - tt::tt_metal::CircularBufferConfig( - num_input_tiles * input_single_tile_size, {{input_cb_index, input_cb_data_format}}) - .set_page_size(input_cb_index, input_single_tile_size); - input_cb_config = input_cb_config.set_globally_allocated_address(*a.buffer()); - auto input_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, input_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - input_cb_index, - input_single_tile_size, - num_input_tiles, - input_single_tile_size * num_input_tiles); - - // CB to store halo data - // hardcode to store 1 row of tiles - uint32_t halo_prev_input_cb_index = tt::CB::c_in1; - uint32_t 
halo_prev_input_cb_max_rows_of_tiles = 4; - uint32_t num_halo_prev_cb_input_tiles = num_input_tiles_in_row * halo_prev_input_cb_max_rows_of_tiles; - tt::tt_metal::CircularBufferConfig halo_prev_input_cb_config = - tt::tt_metal::CircularBufferConfig( - num_halo_prev_cb_input_tiles * input_single_tile_size, {{halo_prev_input_cb_index, input_cb_data_format}}) - .set_page_size(halo_prev_input_cb_index, input_single_tile_size); - auto halo_prev_input_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, halo_prev_input_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - halo_prev_input_cb_index, - input_single_tile_size, - num_halo_prev_cb_input_tiles, - input_single_tile_size * num_halo_prev_cb_input_tiles); - - uint32_t halo_next_input_cb_index = tt::CB::c_in2; - uint32_t halo_next_input_cb_max_rows_of_tiles = 33; // TODO: Remove hardcoding - uint32_t num_halo_next_cb_input_tiles = num_input_tiles_in_row * halo_next_input_cb_max_rows_of_tiles; - tt::tt_metal::CircularBufferConfig halo_next_input_cb_config = - tt::tt_metal::CircularBufferConfig( - num_halo_next_cb_input_tiles * input_single_tile_size, {{halo_next_input_cb_index, input_cb_data_format}}) - .set_page_size(halo_next_input_cb_index, input_single_tile_size); - auto halo_next_input_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, halo_next_input_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - halo_next_input_cb_index, - input_single_tile_size, - num_halo_next_cb_input_tiles, - input_single_tile_size * num_halo_next_cb_input_tiles); - - // CB to store reader pattern array - // read pattern array size == output_height - uint32_t reader_pattern_array_size = output_shard_height; - uint32_t reader_pattern_array_cb_index = tt::CB::c_intermed1; - tt::tt_metal::CircularBufferConfig reader_pattern_array_cb_config = - tt::tt_metal::CircularBufferConfig( - reader_pattern_array_size * 4, {{reader_pattern_array_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(reader_pattern_array_cb_index, 4); - auto reader_pattern_array_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, reader_pattern_array_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - reader_pattern_array_cb_index, - 4, - reader_pattern_array_size, - 4 * reader_pattern_array_size); - - // untilized CB has size - [32, full width] - uint32_t untilize_cb_index = tt::CB::c_intermed2; - uint32_t num_tiles_untilize_cb = num_input_tiles_in_row; - tt::tt_metal::CircularBufferConfig untilize_cb_config = - tt::tt_metal::CircularBufferConfig( - num_tiles_untilize_cb * untilized_single_tile_size, {{untilize_cb_index, untilized_cb_data_format}}) - .set_page_size(untilize_cb_index, untilized_single_tile_size); - auto untilize_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, untilize_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - untilize_cb_index, - untilized_single_tile_size, - num_tiles_untilize_cb, - untilized_single_tile_size * num_tiles_untilize_cb); - - uint32_t num_output_tiles = num_output_tiles_in_row * num_rows_of_output_tiles; - uint32_t untilize_downsampled_cb_index = tt::CB::c_intermed3; - uint32_t num_tiles_untilize_downsampled_cb = - num_output_tiles; // untilize downsampled cb size == output size per core - tt::tt_metal::CircularBufferConfig untilize_downsampled_cb_config = - tt::tt_metal::CircularBufferConfig( - num_tiles_untilize_downsampled_cb * untilized_single_tile_size, - {{untilize_downsampled_cb_index, 
untilized_cb_data_format}}) - .set_page_size(untilize_downsampled_cb_index, untilized_single_tile_size); - auto untilize_downsampled_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, untilize_downsampled_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - untilize_downsampled_cb_index, - untilized_single_tile_size, - num_tiles_untilize_downsampled_cb, - untilized_single_tile_size * num_tiles_untilize_downsampled_cb); - - uint32_t final_tilize_output_cb_index = tt::CB::c_out0; - uint32_t num_tiles_final_tilize_output_cb = num_output_tiles; // final output cb size == output size per core - tt::tt_metal::CircularBufferConfig final_tilize_output_cb_config = - tt::tt_metal::CircularBufferConfig( - num_tiles_final_tilize_output_cb * output_single_tile_size, - {{final_tilize_output_cb_index, output_cb_data_format}}) - .set_page_size(final_tilize_output_cb_index, output_single_tile_size); - final_tilize_output_cb_config = final_tilize_output_cb_config.set_globally_allocated_address(*output.buffer()); - auto final_tilize_output_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, final_tilize_output_cb_config); - log_debug( - tt::LogOp, - "CB {}: PS = {} NP = {} :: TOTAL = {}", - final_tilize_output_cb_index, - output_single_tile_size, - num_tiles_final_tilize_output_cb, - output_single_tile_size * num_tiles_final_tilize_output_cb); - - uint32_t log_base_2_of_conv_act_size_c_bytes = (uint32_t)std::log2((float)input_shard_width_bytes); - uint32_t stride_h_x_image_width = img_stride_h * img_width; - std::vector writer_compile_time_args = { - (std::uint32_t)untilize_cb_index, - (std::uint32_t)untilize_downsampled_cb_index, - (std::uint32_t)final_tilize_output_cb_index, - (std::uint32_t)reader_pattern_array_cb_index, - (std::uint32_t)datum_size(untilized_cb_data_format), - (std::uint32_t)input_shard_width_bytes, - (std::uint32_t)halo_prev_input_cb_index, - (std::uint32_t)halo_next_input_cb_index, - log_base_2_of_conv_act_size_c_bytes, - stride_h_x_image_width}; - - // Writer to downsample - drops rows from untilized cb - tt::tt_metal::KernelHandle downsample_writer_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/downsample/device/kernels/downsample_writer_kernel.cpp", - core_range, - tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - - vector compute_args = { - input_cb_index, - halo_prev_input_cb_index, - halo_next_input_cb_index, - untilize_cb_index, - untilize_downsampled_cb_index, - final_tilize_output_cb_index, - num_input_tiles_in_row, - num_rows_of_output_tiles, - num_output_tiles_in_row, - }; - string compute_kernel = "ttnn/cpp/ttnn/operations/data_movement/downsample/device/kernels/downsample_compute_kernel.cpp"; - if (num_input_tiles_in_row <= MAX_PACK_UNTILIZE_WIDTH) { - compute_kernel = - "ttnn/cpp/ttnn/operations/data_movement/downsample/device/kernels/downsample_fast_pack_untilize_compute_kernel.cpp"; - } - auto downsample_compute_kernel_id = tt::tt_metal::CreateKernel( - program, compute_kernel, core_range, tt::tt_metal::ComputeConfig{.compile_args = compute_args}); - - // track img h, img w, across cores - ImgTrackingVars v; - CoreCoord prev_core = {0, 0}; - bool input_flat_h_is_of_current_core = true; - const auto& cores = corerange_to_cores(all_cores, std::nullopt, true); - for (uint32_t i = 0; i < num_cores; i++) { - const CoreCoord& core = cores[i]; - // cout << "i=" << i << endl; - - uint32_t input_end_flat_h = input_shard_height - 1; - uint32_t output_end_flat_h = 
output_shard_height - 1; - - bool halo_prev_read_enabled = false; - DownsampleReadPatternParams halo_prev_read_pattern_params; - uint32_t halo_prev_noc_x = 0; - uint32_t halo_prev_noc_y = 0; - uint32_t halo_prev_start_addr = 0; - uint32_t halo_prev_addr_offset = 0; - uint32_t halo_prev_num_tiles = 0; - uint32_t halo_prev_size_bytes = 0; - uint32_t halo_prev_input_num_rows_of_tiles = 0; - uint32_t halo_prev_read_pattern_offset = 0; - - bool halo_next_read_enabled = false; - DownsampleReadPatternParams halo_next_read_pattern_params; - uint32_t halo_next_noc_x = 0; - uint32_t halo_next_noc_y = 0; - uint32_t halo_next_start_addr = 0; - uint32_t halo_next_addr_offset = 0; - uint32_t halo_next_num_tiles = 0; - uint32_t halo_next_size_bytes = 0; - uint32_t halo_next_input_num_rows_of_tiles = 0; - uint32_t halo_next_read_pattern_offset = 0; - uint32_t local_read_pattern_offset = 0; - uint32_t current_core_input_end_flat_h = input_end_flat_h; - uint32_t next_core_input_end_flat_h = input_end_flat_h; - if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { - if (i % num_cores_x == 0) { - // first core in row - // reset - v.input_flat_h = 0; - v.output_flat_h = 0; - } else if (i % num_cores_x == num_cores_x - 1) { - // set unpadded height as end index for last core in row - current_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; - output_end_flat_h = last_core_output_shard_height_unpadded - 1; - } else if (i % num_cores_x == num_cores_x - 2) { - next_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; - } - } else if (i == num_cores - 1) { - // for height sharding, set unpadded height as end index for last core - current_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; - output_end_flat_h = last_core_output_shard_height_unpadded - 1; - } else if (i == num_cores - 2) { - next_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; - } - if (v.input_flat_h != 0 && !input_flat_h_is_of_current_core) { - // halo region of previous core - TT_ASSERT(i != 0); - if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { - TT_ASSERT(prev_core.y == core.y); // for block sharding case, prev core is left core - } - halo_prev_read_enabled = true; - TT_ASSERT(v.input_flat_h < input_shard_height); - // get halo start tile address from height idx - uint32_t halo_prev_start_tile_id_h = v.input_flat_h / TILE_HEIGHT; - TT_ASSERT( - input_shard_height - v.input_flat_h <= - TILE_HEIGHT * - halo_prev_input_cb_max_rows_of_tiles); // halo input cb is hardcoded to store only 4 rows of tiles - // for now. 
TODO: allocate bigger CB or read in blocks - // get halo size - halo_prev_size_bytes = (input_shard_height - (halo_prev_start_tile_id_h * TILE_HEIGHT)) * - input_shard_width / TILE_HW * input_single_tile_size; - TT_ASSERT(halo_prev_size_bytes % input_single_tile_size == 0); - halo_prev_num_tiles = halo_prev_size_bytes / input_single_tile_size; - TT_ASSERT(halo_prev_num_tiles <= num_halo_prev_cb_input_tiles); - TT_ASSERT(halo_prev_num_tiles % num_input_tiles_in_row == 0); - halo_prev_input_num_rows_of_tiles = halo_prev_num_tiles / num_input_tiles_in_row; - halo_prev_addr_offset = num_input_tiles_in_row * halo_prev_start_tile_id_h * input_single_tile_size; - halo_prev_start_addr = GetCircularBufferConfig(program, input_cb).globally_allocated_address().value(); - - TT_ASSERT( - (halo_prev_start_addr + halo_prev_addr_offset) % 32 == 0); // read address should be 32 byte aligned - auto halo_noc_coords = device->worker_core_from_logical_core(prev_core); - halo_prev_noc_x = halo_noc_coords.x; - halo_prev_noc_y = halo_noc_coords.y; - TT_ASSERT(v.input_flat_h >= halo_prev_start_tile_id_h * TILE_HEIGHT); - halo_prev_read_pattern_offset = v.input_flat_h - (halo_prev_start_tile_id_h * TILE_HEIGHT); - local_read_pattern_offset = halo_prev_input_num_rows_of_tiles * TILE_HEIGHT; - halo_prev_read_pattern_params = generate_downsample_read_pattern( - v, img_height, img_width, img_stride_h, img_stride_w, input_end_flat_h, output_end_flat_h, true, false); - } - // local core - TT_ASSERT(v.output_flat_h < output_shard_height); - uint32_t local_start_h = v.input_flat_h; - DownsampleReadPatternParams local_read_pattern_params = generate_downsample_read_pattern( - v, - img_height, - img_width, - img_stride_h, - img_stride_w, - current_core_input_end_flat_h, - output_end_flat_h, - false, - false); - TT_ASSERT(v.output_flat_h <= output_shard_height); - uint32_t local_end_h_exclusive = v.input_flat_h == 0 ? 
input_shard_height : v.input_flat_h; - uint32_t local_num_rows = local_end_h_exclusive - local_start_h; - TT_ASSERT(local_num_rows > 0); - uint32_t local_input_num_rows_of_tiles = std::ceil((double)local_num_rows / (double)TILE_HEIGHT); - uint32_t local_input_offset_rows_of_tiles = local_start_h / TILE_HEIGHT; - if (local_start_h != 0) { - TT_ASSERT(local_read_pattern_offset == 0); - local_read_pattern_offset = local_start_h % TILE_HEIGHT; - } - if (v.input_flat_h != 0) { - input_flat_h_is_of_current_core = false; - } else { - input_flat_h_is_of_current_core = true; // updating flag for next core - } - TT_ASSERT(local_input_num_rows_of_tiles <= num_rows_of_input_tiles); - - if (v.output_flat_h != 0) { - // need to read halo from next core - TT_ASSERT(i != num_cores - 1); - TT_ASSERT(v.input_flat_h == 0); - const CoreCoord& next_core = cores[i + 1]; - if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { - TT_ASSERT(next_core.y == core.y); // for block sharding case, next core is right core - } - halo_next_read_enabled = true; - halo_next_read_pattern_params = generate_downsample_read_pattern( - v, - img_height, - img_width, - img_stride_h, - img_stride_w, - next_core_input_end_flat_h, - output_end_flat_h, - false, - true); - TT_ASSERT(v.output_flat_h == 0); - TT_ASSERT(v.input_flat_h != 0 && v.input_flat_h < input_shard_height); - TT_ASSERT( - v.input_flat_h <= TILE_HEIGHT * halo_next_input_cb_max_rows_of_tiles, - "v.input_flat_h ({}) should be <= TILE_HEIGHT * halo_next_input_cb_max_rows_of_tiles ({})", - v.input_flat_h, - halo_next_input_cb_max_rows_of_tiles); // halo next input cb is hardcoded to store only 5 rows of tiles - // for now. TODO: allocate bigger CB or read in blocks - uint32_t halo_next_end_tile_id_h = v.input_flat_h / TILE_HEIGHT; - // get halo size - halo_next_size_bytes = - (halo_next_end_tile_id_h + 1) * TILE_HEIGHT * input_shard_width / TILE_HW * input_single_tile_size; - TT_ASSERT(halo_next_size_bytes % input_single_tile_size == 0); - halo_next_num_tiles = halo_next_size_bytes / input_single_tile_size; - TT_ASSERT(halo_next_num_tiles <= num_halo_next_cb_input_tiles); - TT_ASSERT(halo_next_num_tiles % num_input_tiles_in_row == 0); - halo_next_input_num_rows_of_tiles = halo_next_num_tiles / num_input_tiles_in_row; - halo_next_addr_offset = 0; - halo_next_start_addr = GetCircularBufferConfig(program, input_cb).globally_allocated_address().value(); - TT_ASSERT( - (halo_next_start_addr + halo_next_addr_offset) % 32 == 0); // read address should be 32 byte aligned - auto halo_noc_coords = device->worker_core_from_logical_core(next_core); - halo_next_noc_x = halo_noc_coords.x; - halo_next_noc_y = halo_noc_coords.y; - TT_ASSERT(halo_prev_input_num_rows_of_tiles == 0); - halo_next_read_pattern_offset = local_input_num_rows_of_tiles * TILE_HEIGHT; - } - TT_ASSERT(v.output_flat_h == 0); - - // Compile runtime args - vector compile_rt_kernel_args = { - local_input_num_rows_of_tiles, - local_input_offset_rows_of_tiles, - halo_prev_read_enabled, - halo_prev_input_num_rows_of_tiles, - halo_next_read_enabled, - halo_next_input_num_rows_of_tiles, - }; - - tt::tt_metal::SetRuntimeArgs(program, downsample_compute_kernel_id, core, compile_rt_kernel_args); - - // Writer runtime args - vector writer_kernel_args = { - (uint32_t)img_height, - (uint32_t)img_width, - (uint32_t)img_stride_h, - (uint32_t)img_stride_w, - - // halo prev args - halo_prev_read_enabled, - halo_prev_noc_x, - halo_prev_noc_y, - halo_prev_num_tiles, - halo_prev_start_addr, - halo_prev_addr_offset, - 
halo_prev_size_bytes, - - // halo prev read pattern args - halo_prev_read_pattern_offset, - halo_prev_read_pattern_params.top_partial_middle_aligned_row_width, - halo_prev_read_pattern_params.skip_top_partial_middle_aligned_row, - halo_prev_read_pattern_params.top_partial_right_aligned_row_width, - halo_prev_read_pattern_params.skip_top_partial_right_aligned_row, - halo_prev_read_pattern_params.num_rows_top_partial_image, - halo_prev_read_pattern_params.num_skip_rows_top_partial_image, - halo_prev_read_pattern_params.num_full_images, - halo_prev_read_pattern_params.num_rows_bottom_partial_image, - halo_prev_read_pattern_params.num_skip_rows_bottom_partial_image, - halo_prev_read_pattern_params.bottom_partial_left_aligned_row_width, - halo_prev_read_pattern_params.skip_bottom_partial_left_aligned_row, - - // local read pattern args - local_read_pattern_offset, - local_read_pattern_params.top_partial_middle_aligned_row_width, - local_read_pattern_params.skip_top_partial_middle_aligned_row, - local_read_pattern_params.top_partial_right_aligned_row_width, - local_read_pattern_params.skip_top_partial_right_aligned_row, - local_read_pattern_params.num_rows_top_partial_image, - local_read_pattern_params.num_skip_rows_top_partial_image, - local_read_pattern_params.num_full_images, - local_read_pattern_params.num_rows_bottom_partial_image, - local_read_pattern_params.num_skip_rows_bottom_partial_image, - local_read_pattern_params.bottom_partial_left_aligned_row_width, - local_read_pattern_params.skip_bottom_partial_left_aligned_row, - - // halo next core args - halo_next_read_enabled, - halo_next_noc_x, - halo_next_noc_y, - halo_next_num_tiles, - halo_next_start_addr, - halo_next_addr_offset, - halo_next_size_bytes, - - // halo next read pattern args - halo_next_read_pattern_offset, - halo_next_read_pattern_params.top_partial_middle_aligned_row_width, - halo_next_read_pattern_params.skip_top_partial_middle_aligned_row, - halo_next_read_pattern_params.top_partial_right_aligned_row_width, - halo_next_read_pattern_params.skip_top_partial_right_aligned_row, - halo_next_read_pattern_params.num_rows_top_partial_image, - halo_next_read_pattern_params.num_skip_rows_top_partial_image, - halo_next_read_pattern_params.num_full_images, - halo_next_read_pattern_params.num_rows_bottom_partial_image, - halo_next_read_pattern_params.num_skip_rows_bottom_partial_image, - halo_next_read_pattern_params.bottom_partial_left_aligned_row_width, - halo_next_read_pattern_params.skip_bottom_partial_left_aligned_row, - - halo_prev_input_num_rows_of_tiles + local_input_num_rows_of_tiles + halo_next_input_num_rows_of_tiles, - num_input_tiles_in_row, - num_output_tiles, - - (uint32_t) false}; - - tt::tt_metal::SetRuntimeArgs(program, downsample_writer_kernel_id, core, writer_kernel_args); - prev_core = core; - } - - auto override_runtime_args_callback = [input_cb = input_cb, - final_tilize_output_cb = final_tilize_output_cb, - downsample_writer_kernel_id = downsample_writer_kernel_id, - cores = cores]( - const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { - auto src_buffer = input_tensors.at(0).buffer(); - auto dst_buffer = output_tensors.at(0).buffer(); - - UpdateDynamicCircularBufferAddress(program, input_cb, *src_buffer); - UpdateDynamicCircularBufferAddress(program, final_tilize_output_cb, *dst_buffer); - - auto& writer_runtime_args_by_core = GetRuntimeArgs(program, downsample_writer_kernel_id); - for (const auto& 
core : cores) { - auto& runtime_args = writer_runtime_args_by_core[core.x][core.y]; - runtime_args[8] = src_buffer->address(); - runtime_args[39] = src_buffer->address(); - } - }; - - return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; -} -} //name space data_movement -} // namespace operations -} // namespace ttnn +} // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.cpp new file mode 100644 index 00000000000..5c6bb5a9e88 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.cpp @@ -0,0 +1,877 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "downsample_program_factory.hpp" + +#include + +#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/untilize/untilize_op.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/work_split.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "tt_metal/host_api.hpp" + +using namespace tt::constants; + + +namespace ttnn::operations::data_movement::detail { + +std::pair<uint32_t, uint32_t> get_num_cores_height_width_sliced( + CoreRangeSet all_cores, TensorMemoryLayout memory_layout, ShardOrientation shard_orientation) { + TT_ASSERT( + memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || memory_layout == TensorMemoryLayout::BLOCK_SHARDED); + if (memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) { + TT_ASSERT(shard_orientation == ShardOrientation::ROW_MAJOR); + } else { + TT_ASSERT(shard_orientation == ShardOrientation::COL_MAJOR); + TT_ASSERT(all_cores.ranges().size() == 1); + } + uint32_t num_cores = all_cores.num_cores(); + auto first_core_range = *all_cores.ranges().begin(); + uint32_t num_cores_height_sliced = + memory_layout == TensorMemoryLayout::HEIGHT_SHARDED ? num_cores : first_core_range.end_coord.x + 1; + uint32_t num_cores_width_sliced = memory_layout == TensorMemoryLayout::HEIGHT_SHARDED + ? 
1 + : first_core_range.end_coord.y + 1; // width is not sliced when height sharded + return {num_cores_height_sliced, num_cores_width_sliced}; +} + + +struct DownsampleReadPatternParams { + uint32_t top_partial_middle_aligned_row_width; + uint32_t skip_top_partial_middle_aligned_row; + uint32_t top_partial_right_aligned_row_width; + uint32_t skip_top_partial_right_aligned_row; + uint32_t num_rows_top_partial_image; + uint32_t num_skip_rows_top_partial_image; + uint32_t num_full_images; + uint32_t num_rows_bottom_partial_image; + uint32_t num_skip_rows_bottom_partial_image; + uint32_t bottom_partial_left_aligned_row_width; + uint32_t skip_bottom_partial_left_aligned_row; +}; + + +struct ImgTrackingVars { + uint32_t img_h = 0; + uint32_t img_w = 0; + uint32_t next_img_h = 0; // img_h after stride + uint32_t next_img_w = 0; + uint32_t input_flat_h = 0; // index within sharded input + uint32_t output_flat_h = 0; // index within sharded output +}; + +DownsampleReadPatternParams generate_downsample_read_pattern( + ImgTrackingVars& v, + uint32_t img_height, + uint32_t img_width, + uint32_t img_stride_h, + uint32_t img_stride_w, + uint32_t input_end_flat_h, + uint32_t output_end_flat_h, + bool current_region_is_halo_prev_core, + bool current_region_is_halo_next_core) { + // Sanity checks at the start for local data + TT_ASSERT(v.next_img_h >= v.img_h); + TT_ASSERT(v.next_img_w == v.img_w); // assumption that the start is picked and not skipped by stride + TT_ASSERT(v.img_h < img_height); + TT_ASSERT(v.next_img_w < img_width); + if (current_region_is_halo_prev_core) { + // cout << "GENERATING READ PATTERN FOR HALO REGION FROM PREVIOUS CORE" << endl; + TT_ASSERT(!current_region_is_halo_next_core); + TT_ASSERT(v.input_flat_h != 0); + TT_ASSERT(v.output_flat_h == 0); + } else if (current_region_is_halo_next_core) { + // cout << "GENERATING READ PATTERN FOR HALO REGION FROM NEXT CORE" << endl; + TT_ASSERT(!current_region_is_halo_prev_core); + TT_ASSERT(v.input_flat_h == 0); + TT_ASSERT(v.output_flat_h != 0); + } else { + // cout << "GENERATING READ PATTERN FOR LOCAL REGION" << endl; + } + + // cout << "img_h=" << v.img_h << ", img_w=" << v.img_w << ", next_img_h=" << v.next_img_h << ", next_img_w=" << + // v.img_w << endl; cout << "v.input_flat_h=" << v.input_flat_h << ", input_end_flat_h=" << input_end_flat_h << ", + // v.output_flat_h=" << v.output_flat_h << ", output_end_flat_h=" << output_end_flat_h << endl; + + TT_ASSERT(v.input_flat_h < input_end_flat_h); + TT_ASSERT(v.output_flat_h < output_end_flat_h); + + uint32_t output_img_height = std::ceil((double)img_height / (double)img_stride_h); + uint32_t output_img_width = std::ceil((double)img_width / (double)img_stride_w); + bool found_halo_for_next_core = false; + + uint32_t top_partial_middle_aligned_row_width = 0; + uint32_t skip_top_partial_middle_aligned_row = 1; + uint32_t top_partial_right_aligned_row_width = 0; + uint32_t skip_top_partial_right_aligned_row = 1; + uint32_t num_rows_top_partial_image = 0; + uint32_t num_skip_rows_top_partial_image = 0; + uint32_t num_full_images = 0; + uint32_t num_rows_bottom_partial_image = 0; + uint32_t num_skip_rows_bottom_partial_image = 0; + uint32_t bottom_partial_left_aligned_row_width = 0; + uint32_t skip_bottom_partial_left_aligned_row = 1; + if (v.img_w != 0) { + // Check if its right aligned or middle aligned (special corner case for halo) + if (v.input_flat_h + img_width - v.img_w <= input_end_flat_h + 1) { + // top partial right aligned + top_partial_right_aligned_row_width = img_width - 
v.img_w; + skip_top_partial_right_aligned_row = (v.next_img_h == v.img_h) ? 0 : 1; + v.input_flat_h += top_partial_right_aligned_row_width; + if (!skip_top_partial_right_aligned_row) { + v.output_flat_h += std::ceil((double)top_partial_right_aligned_row_width / (double)img_stride_w); + TT_ASSERT(v.output_flat_h <= output_end_flat_h); + } + v.img_w = 0; + v.next_img_w = 0; + if (v.img_h == img_height - 1) { + v.img_h = 0; + v.next_img_h = 0; + } else { + v.img_h += 1; + if (v.next_img_h < v.img_h) { + v.next_img_h += img_stride_h; + } + } + } else { + // special corner case for halo region + // middle aligned + TT_ASSERT(input_end_flat_h - v.input_flat_h + 1 < img_width); + TT_ASSERT(current_region_is_halo_prev_core || current_region_is_halo_next_core); + // top partial middle aligned + top_partial_middle_aligned_row_width = input_end_flat_h - v.input_flat_h + 1; + skip_top_partial_middle_aligned_row = (v.next_img_h == v.img_h) ? 0 : 1; + v.input_flat_h += top_partial_middle_aligned_row_width; + if (!skip_top_partial_middle_aligned_row) { + v.output_flat_h += std::ceil((double)top_partial_middle_aligned_row_width / (double)img_stride_w); + TT_ASSERT(v.output_flat_h <= output_end_flat_h); + } + uint32_t img_w_start = v.img_w; + while (v.img_w < img_w_start + top_partial_middle_aligned_row_width) { + v.img_w += 1; + if (v.next_img_w < v.img_w) { + v.next_img_w += img_stride_w; + } + } + TT_ASSERT(v.img_w < img_width - 1); + } + } + TT_ASSERT(v.next_img_w == v.img_w); + TT_ASSERT(v.output_flat_h <= output_end_flat_h); + TT_ASSERT(v.next_img_h >= v.img_h); + if (v.img_w != 0) { + // special case for halo + TT_ASSERT(v.input_flat_h == input_end_flat_h + 1); + } + TT_ASSERT(v.img_h < img_height && v.img_w < img_width); + + uint32_t num_rows_remaining_of_current_image = (v.img_h == 0) ? 
0 : img_height - v.img_h; + if (num_rows_remaining_of_current_image > 0) { + uint32_t num_rows_to_skip = v.next_img_h - v.img_h; + uint32_t output_h_from_remaining_rows_of_current_image = + std::ceil((double)(num_rows_remaining_of_current_image - num_rows_to_skip) / (double)img_stride_h) * + output_img_width; + bool output_for_partial_top_image = + v.output_flat_h + output_h_from_remaining_rows_of_current_image <= output_end_flat_h + 1; + bool input_for_partial_top_image = + v.input_flat_h + (num_rows_remaining_of_current_image * img_width) <= input_end_flat_h + 1; + if (output_for_partial_top_image && input_for_partial_top_image) { + // Top partial image section + num_rows_top_partial_image = img_height - v.img_h; + num_skip_rows_top_partial_image = v.next_img_h - v.img_h; + // Sanity check + TT_ASSERT((v.img_h + num_rows_top_partial_image == img_height)); + v.img_h = 0; + v.next_img_h = 0; + v.input_flat_h += (num_rows_top_partial_image * img_width); + v.output_flat_h += output_h_from_remaining_rows_of_current_image; + TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); + } + TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); + } + TT_ASSERT(v.img_h < img_height && v.img_w < img_width); + + if (v.img_h == 0 && v.img_w == 0) { + // Check for full images + while (1) { + bool output_for_current_full_image = + v.output_flat_h + (output_img_height * output_img_width) <= output_end_flat_h + 1; + bool input_for_current_full_image = v.input_flat_h + (img_height * img_width) <= input_end_flat_h + 1; + if (!output_for_current_full_image || !input_for_current_full_image) { + break; + } + v.input_flat_h += (img_height * img_width); + v.img_h = 0; + v.img_w = 0; + v.next_img_h = 0; + v.next_img_w = 0; + num_full_images += 1; + v.output_flat_h += (output_img_height * output_img_width); + } + TT_ASSERT(v.img_h == 0 && v.img_w == 0 && v.next_img_h == 0 && v.next_img_w == 0); + } + + // Sanity check + TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); + TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); + + bool found_first_unskipped_row_in_bottom_partial_imgage = false; + // check for bottom partial image rows + while (1) { + bool output_for_bottom_partial_image_row = (v.next_img_h == v.img_h) + ? 
(v.output_flat_h + output_img_width <= output_end_flat_h + 1) + : true; // true for skipped row + bool input_for_bottom_partial_image_row = v.input_flat_h + img_width <= input_end_flat_h + 1; + if (!output_for_bottom_partial_image_row || !input_for_bottom_partial_image_row) { + break; + } + if (!found_first_unskipped_row_in_bottom_partial_imgage) { + if (v.next_img_h == v.img_h) { + found_first_unskipped_row_in_bottom_partial_imgage = true; + } else { + TT_ASSERT(v.next_img_h > v.img_h); + num_skip_rows_bottom_partial_image += 1; + } + } + v.input_flat_h += img_width; + if (v.next_img_h == v.img_h) { + v.output_flat_h += output_img_width; + } + v.img_w = 0; + v.next_img_w = 0; + TT_ASSERT(v.img_h < img_height - 1); // this is supposed to be a bottom partial image + v.img_h += 1; + if (v.next_img_h < v.img_h) { + v.next_img_h += img_stride_h; + TT_ASSERT(v.next_img_h <= img_height); // odd heights and odd size sharding with stride > 1 not supported + if (v.next_img_h == img_height && v.img_h == img_height) { + v.next_img_h = 0; + v.img_h = 0; + break; + } + } + num_rows_bottom_partial_image += 1; + } + + // Sanity check + TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); + TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); + TT_ASSERT(v.img_h < img_height && v.img_w < img_width); + + // check if there is a bottom partial left aligned row + if (v.input_flat_h <= input_end_flat_h && v.output_flat_h <= output_end_flat_h) { + TT_ASSERT(v.img_w == 0 && v.next_img_w == 0); + // bottom partial left aligned row width can be split between 2 cores + uint32_t input_remaining = input_end_flat_h - v.input_flat_h + 1; + uint32_t output_remaining = output_end_flat_h - v.output_flat_h + 1; + TT_ASSERT( + output_remaining < output_img_width || + input_remaining < img_width); // there must be a partial width either on input side or output side + bottom_partial_left_aligned_row_width = input_remaining; + if (output_remaining < output_img_width) { + bottom_partial_left_aligned_row_width = std::min(input_remaining, output_remaining * img_stride_w); + } + // sanity + TT_ASSERT(bottom_partial_left_aligned_row_width < img_width); + TT_ASSERT(v.next_img_h >= v.img_h); + skip_bottom_partial_left_aligned_row = (v.next_img_h == v.img_h) ? 
0 : 1; + while (v.img_w < bottom_partial_left_aligned_row_width) { + v.img_w += 1; + if (v.next_img_w < v.img_w) { + v.next_img_w += img_stride_w; + TT_ASSERT(v.next_img_w < img_width); // odd widths and odd size sharding with stride > 1 not supported + } + } + TT_ASSERT(v.img_w == bottom_partial_left_aligned_row_width && v.next_img_w >= v.img_w); + v.input_flat_h += bottom_partial_left_aligned_row_width; + if (!skip_bottom_partial_left_aligned_row) { + v.output_flat_h += std::ceil((double)bottom_partial_left_aligned_row_width / (double)img_stride_w); + } + } + TT_ASSERT(v.img_h < img_height && v.img_w < img_width); + + if (0) { + log_debug(tt::LogOp, " top_partial_middle_aligned_row_width: {}", top_partial_middle_aligned_row_width); + log_debug(tt::LogOp, " skip_top_partial_middle_aligned_row: {}", skip_top_partial_middle_aligned_row); + log_debug(tt::LogOp, " top_partial_right_aligned_row_width: {}", top_partial_right_aligned_row_width); + log_debug(tt::LogOp, " skip_top_partial_right_aligned_row: {}", skip_top_partial_right_aligned_row); + log_debug(tt::LogOp, " num_rows_top_partial_image: {}", num_rows_top_partial_image); + log_debug(tt::LogOp, " num_skip_rows_top_partial_image: {}", num_skip_rows_top_partial_image); + log_debug(tt::LogOp, " num_full_images: {}", num_full_images); + log_debug(tt::LogOp, " num_rows_bottom_partial_image: {}", num_rows_bottom_partial_image); + log_debug(tt::LogOp, " num_skip_rows_bottom_partial_image: {}", num_skip_rows_bottom_partial_image); + log_debug(tt::LogOp, " bottom_partial_left_aligned_row_width: {}", bottom_partial_left_aligned_row_width); + log_debug(tt::LogOp, " skip_bottom_partial_left_aligned_row: {}", skip_bottom_partial_left_aligned_row); + log_debug(tt::LogOp, " v.input_flat_h: {}", v.input_flat_h); + log_debug(tt::LogOp, " v.output_flat_h: {}", v.output_flat_h); + log_debug(tt::LogOp, " input_end_flat_h: {}", input_end_flat_h); + log_debug(tt::LogOp, " output_end_flat_h: {}", output_end_flat_h); + // cout << "img_h=" << v.img_h << ", img_w=" << v.img_w << ", next_img_h=" << v.next_img_h << ", next_img_w=" << + // v.img_w << endl; + } + + // Sanity check + TT_ASSERT(v.input_flat_h <= input_end_flat_h + 1); + TT_ASSERT(v.output_flat_h <= output_end_flat_h + 1); + + if (v.input_flat_h == input_end_flat_h + 1) { + v.input_flat_h = 0; + } + if (v.output_flat_h == output_end_flat_h + 1) { + v.output_flat_h = 0; + } + return DownsampleReadPatternParams{ + .top_partial_middle_aligned_row_width = top_partial_middle_aligned_row_width, + .skip_top_partial_middle_aligned_row = skip_top_partial_middle_aligned_row, + .top_partial_right_aligned_row_width = top_partial_right_aligned_row_width, + .skip_top_partial_right_aligned_row = skip_top_partial_right_aligned_row, + .num_rows_top_partial_image = num_rows_top_partial_image, + .num_skip_rows_top_partial_image = num_skip_rows_top_partial_image, + .num_full_images = num_full_images, + .num_rows_bottom_partial_image = num_rows_bottom_partial_image, + .num_skip_rows_bottom_partial_image = num_skip_rows_bottom_partial_image, + .bottom_partial_left_aligned_row_width = bottom_partial_left_aligned_row_width, + .skip_bottom_partial_left_aligned_row = skip_bottom_partial_left_aligned_row}; +} + + +operation::ProgramWithCallbacks downsample_single_core( + const Tensor& a, std::array downsample_params, Tensor& output) { + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t 
input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + tt::DataFormat untilized_cb_data_format = tt::DataFormat::Float16_b; + uint32_t untilized_single_tile_size = tt::tt_metal::detail::TileSize(untilized_cb_data_format); + auto [img_batch_size, img_height, img_width, img_stride_h, img_stride_w] = downsample_params; + tt::tt_metal::Buffer* src0_buffer = a.buffer(); + + TT_ASSERT(a.get_legacy_shape()[0] == 1 && a.get_legacy_shape()[1] == 1); + TT_ASSERT(output.get_legacy_shape()[0] == 1 && output.get_legacy_shape()[1] == 1); + + tt::tt_metal::Device* device = a.device(); + + tt::tt_metal::Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + // Sanity check of output size + TT_ASSERT(output.volume() % TILE_HW == 0); + uint32_t unpadded_input_volume = img_batch_size * img_height * img_width; + TT_ASSERT(a.volume() >= unpadded_input_volume); + uint32_t unpadded_output_volume = ceil((double)unpadded_input_volume / (double)(img_stride_h * img_stride_w)); + TT_ASSERT(output.volume() >= unpadded_output_volume); + + uint32_t ncores_x_full_grid = device->compute_with_storage_grid_size().x; + auto [num_cores_height_sliced, num_cores_width_sliced] = get_num_cores_height_width_sliced( + a.shard_spec().value().grid, a.memory_config().memory_layout, a.shard_spec().value().orientation); + uint32_t num_cores = num_cores_height_sliced * num_cores_width_sliced; + auto all_cores = a.shard_spec().value().grid; + auto memory_layout = a.memory_config().memory_layout; + TT_ASSERT(all_cores == output.shard_spec().value().grid); + TT_ASSERT(memory_layout == output.memory_config().memory_layout); + TT_ASSERT( + memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || memory_layout == TensorMemoryLayout::BLOCK_SHARDED); + if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { + TT_ASSERT(all_cores.ranges().size() == 1); + } else { + TT_ASSERT(num_cores_width_sliced == 1); + } + uint32_t num_cores_x = + memory_layout == TensorMemoryLayout::HEIGHT_SHARDED ? 
ncores_x_full_grid : num_cores_height_sliced; + + auto core_range = all_cores; + + uint32_t input_height = + a.get_legacy_shape()[2]; // input height == flattened face of input image, multiple images are stacked in H dim + uint32_t input_width = a.get_legacy_shape()[3]; // input width == input image # of channels + uint32_t output_height = output.get_legacy_shape()[2]; // output height == flattened face of output image, multiple + // images are stacked in H dim + uint32_t output_width = output.get_legacy_shape()[3]; + TT_ASSERT(input_width == output_width); + + uint32_t input_height_unpadded = img_batch_size * img_height * img_width; + TT_ASSERT(input_height >= input_height_unpadded); + uint32_t output_height_unpadded = + img_batch_size * std::ceil((double)(img_height * img_width) / (double)(img_stride_h * img_stride_w)); + TT_ASSERT(output_height >= output_height_unpadded); + + uint32_t input_shard_height = a.shard_spec().value().shape[0]; + TT_ASSERT(input_shard_height * num_cores_height_sliced >= input_height); // last core shard size may be padded + uint32_t input_height_padded = input_shard_height * num_cores_height_sliced; + uint32_t input_shard_width = a.shard_spec().value().shape[1]; + TT_ASSERT(input_shard_width * num_cores_width_sliced == input_width); + + uint32_t input_height_padding = input_height_padded - input_height_unpadded; + TT_ASSERT(input_height_padding < input_shard_height); // last core has padding + uint32_t last_core_input_shard_height_unpadded = input_shard_height - input_height_padding; + + uint32_t output_shard_height = output.shard_spec().value().shape[0]; + uint32_t output_shard_width = output.shard_spec().value().shape[1]; + + TT_ASSERT(output_shard_width == input_shard_width); + TT_ASSERT(output_shard_height * num_cores_height_sliced >= output_height); // last core shard size may be padded + uint32_t output_height_padded = output_shard_height * num_cores_height_sliced; + uint32_t output_height_padding = output_height_padded - output_height_unpadded; + TT_ASSERT(output_height_padding < output_shard_height); + uint32_t last_core_output_shard_height_unpadded = output_shard_height - output_height_padding; + + uint32_t input_shard_width_bytes = input_shard_width * datum_size(untilized_cb_data_format); + + TT_ASSERT(input_shard_width % TILE_WIDTH == 0); + uint32_t num_input_tiles_in_row = input_shard_width / TILE_WIDTH; + TT_ASSERT(input_shard_height % TILE_HEIGHT == 0); + uint32_t num_rows_of_input_tiles = input_shard_height / TILE_HEIGHT; + + uint32_t num_output_tiles_in_row = num_input_tiles_in_row; + TT_ASSERT(output_shard_height % TILE_HEIGHT == 0); + uint32_t num_rows_of_output_tiles = output_shard_height / TILE_HEIGHT; + + uint32_t input_cb_index = tt::CB::c_in0; + uint32_t num_input_tiles = num_input_tiles_in_row * num_rows_of_input_tiles; + tt::tt_metal::CircularBufferConfig input_cb_config = + tt::tt_metal::CircularBufferConfig( + num_input_tiles * input_single_tile_size, {{input_cb_index, input_cb_data_format}}) + .set_page_size(input_cb_index, input_single_tile_size); + input_cb_config = input_cb_config.set_globally_allocated_address(*a.buffer()); + auto input_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, input_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + input_cb_index, + input_single_tile_size, + num_input_tiles, + input_single_tile_size * num_input_tiles); + + // CB to store halo data + // hardcode to store 1 row of tiles + uint32_t halo_prev_input_cb_index = tt::CB::c_in1; + uint32_t 
halo_prev_input_cb_max_rows_of_tiles = 4; + uint32_t num_halo_prev_cb_input_tiles = num_input_tiles_in_row * halo_prev_input_cb_max_rows_of_tiles; + tt::tt_metal::CircularBufferConfig halo_prev_input_cb_config = + tt::tt_metal::CircularBufferConfig( + num_halo_prev_cb_input_tiles * input_single_tile_size, {{halo_prev_input_cb_index, input_cb_data_format}}) + .set_page_size(halo_prev_input_cb_index, input_single_tile_size); + auto halo_prev_input_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, halo_prev_input_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + halo_prev_input_cb_index, + input_single_tile_size, + num_halo_prev_cb_input_tiles, + input_single_tile_size * num_halo_prev_cb_input_tiles); + + uint32_t halo_next_input_cb_index = tt::CB::c_in2; + uint32_t halo_next_input_cb_max_rows_of_tiles = 33; // TODO: Remove hardcoding + uint32_t num_halo_next_cb_input_tiles = num_input_tiles_in_row * halo_next_input_cb_max_rows_of_tiles; + tt::tt_metal::CircularBufferConfig halo_next_input_cb_config = + tt::tt_metal::CircularBufferConfig( + num_halo_next_cb_input_tiles * input_single_tile_size, {{halo_next_input_cb_index, input_cb_data_format}}) + .set_page_size(halo_next_input_cb_index, input_single_tile_size); + auto halo_next_input_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, halo_next_input_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + halo_next_input_cb_index, + input_single_tile_size, + num_halo_next_cb_input_tiles, + input_single_tile_size * num_halo_next_cb_input_tiles); + + // CB to store reader pattern array + // read pattern array size == output_height + uint32_t reader_pattern_array_size = output_shard_height; + uint32_t reader_pattern_array_cb_index = tt::CB::c_intermed1; + tt::tt_metal::CircularBufferConfig reader_pattern_array_cb_config = + tt::tt_metal::CircularBufferConfig( + reader_pattern_array_size * 4, {{reader_pattern_array_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(reader_pattern_array_cb_index, 4); + auto reader_pattern_array_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, reader_pattern_array_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + reader_pattern_array_cb_index, + 4, + reader_pattern_array_size, + 4 * reader_pattern_array_size); + + // untilized CB has size - [32, full width] + uint32_t untilize_cb_index = tt::CB::c_intermed2; + uint32_t num_tiles_untilize_cb = num_input_tiles_in_row; + tt::tt_metal::CircularBufferConfig untilize_cb_config = + tt::tt_metal::CircularBufferConfig( + num_tiles_untilize_cb * untilized_single_tile_size, {{untilize_cb_index, untilized_cb_data_format}}) + .set_page_size(untilize_cb_index, untilized_single_tile_size); + auto untilize_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, untilize_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + untilize_cb_index, + untilized_single_tile_size, + num_tiles_untilize_cb, + untilized_single_tile_size * num_tiles_untilize_cb); + + uint32_t num_output_tiles = num_output_tiles_in_row * num_rows_of_output_tiles; + uint32_t untilize_downsampled_cb_index = tt::CB::c_intermed3; + uint32_t num_tiles_untilize_downsampled_cb = + num_output_tiles; // untilize downsampled cb size == output size per core + tt::tt_metal::CircularBufferConfig untilize_downsampled_cb_config = + tt::tt_metal::CircularBufferConfig( + num_tiles_untilize_downsampled_cb * untilized_single_tile_size, + {{untilize_downsampled_cb_index, 
untilized_cb_data_format}}) + .set_page_size(untilize_downsampled_cb_index, untilized_single_tile_size); + auto untilize_downsampled_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, untilize_downsampled_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + untilize_downsampled_cb_index, + untilized_single_tile_size, + num_tiles_untilize_downsampled_cb, + untilized_single_tile_size * num_tiles_untilize_downsampled_cb); + + uint32_t final_tilize_output_cb_index = tt::CB::c_out0; + uint32_t num_tiles_final_tilize_output_cb = num_output_tiles; // final output cb size == output size per core + tt::tt_metal::CircularBufferConfig final_tilize_output_cb_config = + tt::tt_metal::CircularBufferConfig( + num_tiles_final_tilize_output_cb * output_single_tile_size, + {{final_tilize_output_cb_index, output_cb_data_format}}) + .set_page_size(final_tilize_output_cb_index, output_single_tile_size); + final_tilize_output_cb_config = final_tilize_output_cb_config.set_globally_allocated_address(*output.buffer()); + auto final_tilize_output_cb = tt::tt_metal::CreateCircularBuffer(program, core_range, final_tilize_output_cb_config); + log_debug( + tt::LogOp, + "CB {}: PS = {} NP = {} :: TOTAL = {}", + final_tilize_output_cb_index, + output_single_tile_size, + num_tiles_final_tilize_output_cb, + output_single_tile_size * num_tiles_final_tilize_output_cb); + + uint32_t log_base_2_of_conv_act_size_c_bytes = (uint32_t)std::log2((float)input_shard_width_bytes); + uint32_t stride_h_x_image_width = img_stride_h * img_width; + std::vector writer_compile_time_args = { + (std::uint32_t)untilize_cb_index, + (std::uint32_t)untilize_downsampled_cb_index, + (std::uint32_t)final_tilize_output_cb_index, + (std::uint32_t)reader_pattern_array_cb_index, + (std::uint32_t)datum_size(untilized_cb_data_format), + (std::uint32_t)input_shard_width_bytes, + (std::uint32_t)halo_prev_input_cb_index, + (std::uint32_t)halo_next_input_cb_index, + log_base_2_of_conv_act_size_c_bytes, + stride_h_x_image_width}; + + // Writer to downsample - drops rows from untilized cb + tt::tt_metal::KernelHandle downsample_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/downsample/device/kernels/downsample_writer_kernel.cpp", + core_range, + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); + + vector compute_args = { + input_cb_index, + halo_prev_input_cb_index, + halo_next_input_cb_index, + untilize_cb_index, + untilize_downsampled_cb_index, + final_tilize_output_cb_index, + num_input_tiles_in_row, + num_rows_of_output_tiles, + num_output_tiles_in_row, + }; + string compute_kernel = "ttnn/cpp/ttnn/operations/data_movement/downsample/device/kernels/downsample_compute_kernel.cpp"; + if (num_input_tiles_in_row <= MAX_PACK_UNTILIZE_WIDTH) { + compute_kernel = + "ttnn/cpp/ttnn/operations/data_movement/downsample/device/kernels/downsample_fast_pack_untilize_compute_kernel.cpp"; + } + auto downsample_compute_kernel_id = tt::tt_metal::CreateKernel( + program, compute_kernel, core_range, tt::tt_metal::ComputeConfig{.compile_args = compute_args}); + + // track img h, img w, across cores + ImgTrackingVars v; + CoreCoord prev_core = {0, 0}; + bool input_flat_h_is_of_current_core = true; + const auto& cores = corerange_to_cores(all_cores, std::nullopt, true); + for (uint32_t i = 0; i < num_cores; i++) { + const CoreCoord& core = cores[i]; + // cout << "i=" << i << endl; + + uint32_t input_end_flat_h = input_shard_height - 1; + uint32_t output_end_flat_h = 
output_shard_height - 1; + + bool halo_prev_read_enabled = false; + DownsampleReadPatternParams halo_prev_read_pattern_params; + uint32_t halo_prev_noc_x = 0; + uint32_t halo_prev_noc_y = 0; + uint32_t halo_prev_start_addr = 0; + uint32_t halo_prev_addr_offset = 0; + uint32_t halo_prev_num_tiles = 0; + uint32_t halo_prev_size_bytes = 0; + uint32_t halo_prev_input_num_rows_of_tiles = 0; + uint32_t halo_prev_read_pattern_offset = 0; + + bool halo_next_read_enabled = false; + DownsampleReadPatternParams halo_next_read_pattern_params; + uint32_t halo_next_noc_x = 0; + uint32_t halo_next_noc_y = 0; + uint32_t halo_next_start_addr = 0; + uint32_t halo_next_addr_offset = 0; + uint32_t halo_next_num_tiles = 0; + uint32_t halo_next_size_bytes = 0; + uint32_t halo_next_input_num_rows_of_tiles = 0; + uint32_t halo_next_read_pattern_offset = 0; + uint32_t local_read_pattern_offset = 0; + uint32_t current_core_input_end_flat_h = input_end_flat_h; + uint32_t next_core_input_end_flat_h = input_end_flat_h; + if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { + if (i % num_cores_x == 0) { + // first core in row + // reset + v.input_flat_h = 0; + v.output_flat_h = 0; + } else if (i % num_cores_x == num_cores_x - 1) { + // set unpadded height as end index for last core in row + current_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; + output_end_flat_h = last_core_output_shard_height_unpadded - 1; + } else if (i % num_cores_x == num_cores_x - 2) { + next_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; + } + } else if (i == num_cores - 1) { + // for height sharding, set unpadded height as end index for last core + current_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; + output_end_flat_h = last_core_output_shard_height_unpadded - 1; + } else if (i == num_cores - 2) { + next_core_input_end_flat_h = last_core_input_shard_height_unpadded - 1; + } + if (v.input_flat_h != 0 && !input_flat_h_is_of_current_core) { + // halo region of previous core + TT_ASSERT(i != 0); + if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { + TT_ASSERT(prev_core.y == core.y); // for block sharding case, prev core is left core + } + halo_prev_read_enabled = true; + TT_ASSERT(v.input_flat_h < input_shard_height); + // get halo start tile address from height idx + uint32_t halo_prev_start_tile_id_h = v.input_flat_h / TILE_HEIGHT; + TT_ASSERT( + input_shard_height - v.input_flat_h <= + TILE_HEIGHT * + halo_prev_input_cb_max_rows_of_tiles); // halo input cb is hardcoded to store only 4 rows of tiles + // for now. 
TODO: allocate bigger CB or read in blocks + // get halo size + halo_prev_size_bytes = (input_shard_height - (halo_prev_start_tile_id_h * TILE_HEIGHT)) * + input_shard_width / TILE_HW * input_single_tile_size; + TT_ASSERT(halo_prev_size_bytes % input_single_tile_size == 0); + halo_prev_num_tiles = halo_prev_size_bytes / input_single_tile_size; + TT_ASSERT(halo_prev_num_tiles <= num_halo_prev_cb_input_tiles); + TT_ASSERT(halo_prev_num_tiles % num_input_tiles_in_row == 0); + halo_prev_input_num_rows_of_tiles = halo_prev_num_tiles / num_input_tiles_in_row; + halo_prev_addr_offset = num_input_tiles_in_row * halo_prev_start_tile_id_h * input_single_tile_size; + halo_prev_start_addr = GetCircularBufferConfig(program, input_cb).globally_allocated_address().value(); + + TT_ASSERT( + (halo_prev_start_addr + halo_prev_addr_offset) % 32 == 0); // read address should be 32 byte aligned + auto halo_noc_coords = device->worker_core_from_logical_core(prev_core); + halo_prev_noc_x = halo_noc_coords.x; + halo_prev_noc_y = halo_noc_coords.y; + TT_ASSERT(v.input_flat_h >= halo_prev_start_tile_id_h * TILE_HEIGHT); + halo_prev_read_pattern_offset = v.input_flat_h - (halo_prev_start_tile_id_h * TILE_HEIGHT); + local_read_pattern_offset = halo_prev_input_num_rows_of_tiles * TILE_HEIGHT; + halo_prev_read_pattern_params = generate_downsample_read_pattern( + v, img_height, img_width, img_stride_h, img_stride_w, input_end_flat_h, output_end_flat_h, true, false); + } + // local core + TT_ASSERT(v.output_flat_h < output_shard_height); + uint32_t local_start_h = v.input_flat_h; + DownsampleReadPatternParams local_read_pattern_params = generate_downsample_read_pattern( + v, + img_height, + img_width, + img_stride_h, + img_stride_w, + current_core_input_end_flat_h, + output_end_flat_h, + false, + false); + TT_ASSERT(v.output_flat_h <= output_shard_height); + uint32_t local_end_h_exclusive = v.input_flat_h == 0 ? 
input_shard_height : v.input_flat_h; + uint32_t local_num_rows = local_end_h_exclusive - local_start_h; + TT_ASSERT(local_num_rows > 0); + uint32_t local_input_num_rows_of_tiles = std::ceil((double)local_num_rows / (double)TILE_HEIGHT); + uint32_t local_input_offset_rows_of_tiles = local_start_h / TILE_HEIGHT; + if (local_start_h != 0) { + TT_ASSERT(local_read_pattern_offset == 0); + local_read_pattern_offset = local_start_h % TILE_HEIGHT; + } + if (v.input_flat_h != 0) { + input_flat_h_is_of_current_core = false; + } else { + input_flat_h_is_of_current_core = true; // updating flag for next core + } + TT_ASSERT(local_input_num_rows_of_tiles <= num_rows_of_input_tiles); + + if (v.output_flat_h != 0) { + // need to read halo from next core + TT_ASSERT(i != num_cores - 1); + TT_ASSERT(v.input_flat_h == 0); + const CoreCoord& next_core = cores[i + 1]; + if (memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { + TT_ASSERT(next_core.y == core.y); // for block sharding case, next core is right core + } + halo_next_read_enabled = true; + halo_next_read_pattern_params = generate_downsample_read_pattern( + v, + img_height, + img_width, + img_stride_h, + img_stride_w, + next_core_input_end_flat_h, + output_end_flat_h, + false, + true); + TT_ASSERT(v.output_flat_h == 0); + TT_ASSERT(v.input_flat_h != 0 && v.input_flat_h < input_shard_height); + TT_ASSERT( + v.input_flat_h <= TILE_HEIGHT * halo_next_input_cb_max_rows_of_tiles, + "v.input_flat_h ({}) should be <= TILE_HEIGHT * halo_next_input_cb_max_rows_of_tiles ({})", + v.input_flat_h, + halo_next_input_cb_max_rows_of_tiles); // halo next input cb is hardcoded to store only 5 rows of tiles + // for now. TODO: allocate bigger CB or read in blocks + uint32_t halo_next_end_tile_id_h = v.input_flat_h / TILE_HEIGHT; + // get halo size + halo_next_size_bytes = + (halo_next_end_tile_id_h + 1) * TILE_HEIGHT * input_shard_width / TILE_HW * input_single_tile_size; + TT_ASSERT(halo_next_size_bytes % input_single_tile_size == 0); + halo_next_num_tiles = halo_next_size_bytes / input_single_tile_size; + TT_ASSERT(halo_next_num_tiles <= num_halo_next_cb_input_tiles); + TT_ASSERT(halo_next_num_tiles % num_input_tiles_in_row == 0); + halo_next_input_num_rows_of_tiles = halo_next_num_tiles / num_input_tiles_in_row; + halo_next_addr_offset = 0; + halo_next_start_addr = GetCircularBufferConfig(program, input_cb).globally_allocated_address().value(); + TT_ASSERT( + (halo_next_start_addr + halo_next_addr_offset) % 32 == 0); // read address should be 32 byte aligned + auto halo_noc_coords = device->worker_core_from_logical_core(next_core); + halo_next_noc_x = halo_noc_coords.x; + halo_next_noc_y = halo_noc_coords.y; + TT_ASSERT(halo_prev_input_num_rows_of_tiles == 0); + halo_next_read_pattern_offset = local_input_num_rows_of_tiles * TILE_HEIGHT; + } + TT_ASSERT(v.output_flat_h == 0); + + // Compile runtime args + vector compile_rt_kernel_args = { + local_input_num_rows_of_tiles, + local_input_offset_rows_of_tiles, + halo_prev_read_enabled, + halo_prev_input_num_rows_of_tiles, + halo_next_read_enabled, + halo_next_input_num_rows_of_tiles, + }; + + tt::tt_metal::SetRuntimeArgs(program, downsample_compute_kernel_id, core, compile_rt_kernel_args); + + // Writer runtime args + vector writer_kernel_args = { + (uint32_t)img_height, + (uint32_t)img_width, + (uint32_t)img_stride_h, + (uint32_t)img_stride_w, + + // halo prev args + halo_prev_read_enabled, + halo_prev_noc_x, + halo_prev_noc_y, + halo_prev_num_tiles, + halo_prev_start_addr, + halo_prev_addr_offset, + 
halo_prev_size_bytes, + + // halo prev read pattern args + halo_prev_read_pattern_offset, + halo_prev_read_pattern_params.top_partial_middle_aligned_row_width, + halo_prev_read_pattern_params.skip_top_partial_middle_aligned_row, + halo_prev_read_pattern_params.top_partial_right_aligned_row_width, + halo_prev_read_pattern_params.skip_top_partial_right_aligned_row, + halo_prev_read_pattern_params.num_rows_top_partial_image, + halo_prev_read_pattern_params.num_skip_rows_top_partial_image, + halo_prev_read_pattern_params.num_full_images, + halo_prev_read_pattern_params.num_rows_bottom_partial_image, + halo_prev_read_pattern_params.num_skip_rows_bottom_partial_image, + halo_prev_read_pattern_params.bottom_partial_left_aligned_row_width, + halo_prev_read_pattern_params.skip_bottom_partial_left_aligned_row, + + // local read pattern args + local_read_pattern_offset, + local_read_pattern_params.top_partial_middle_aligned_row_width, + local_read_pattern_params.skip_top_partial_middle_aligned_row, + local_read_pattern_params.top_partial_right_aligned_row_width, + local_read_pattern_params.skip_top_partial_right_aligned_row, + local_read_pattern_params.num_rows_top_partial_image, + local_read_pattern_params.num_skip_rows_top_partial_image, + local_read_pattern_params.num_full_images, + local_read_pattern_params.num_rows_bottom_partial_image, + local_read_pattern_params.num_skip_rows_bottom_partial_image, + local_read_pattern_params.bottom_partial_left_aligned_row_width, + local_read_pattern_params.skip_bottom_partial_left_aligned_row, + + // halo next core args + halo_next_read_enabled, + halo_next_noc_x, + halo_next_noc_y, + halo_next_num_tiles, + halo_next_start_addr, + halo_next_addr_offset, + halo_next_size_bytes, + + // halo next read pattern args + halo_next_read_pattern_offset, + halo_next_read_pattern_params.top_partial_middle_aligned_row_width, + halo_next_read_pattern_params.skip_top_partial_middle_aligned_row, + halo_next_read_pattern_params.top_partial_right_aligned_row_width, + halo_next_read_pattern_params.skip_top_partial_right_aligned_row, + halo_next_read_pattern_params.num_rows_top_partial_image, + halo_next_read_pattern_params.num_skip_rows_top_partial_image, + halo_next_read_pattern_params.num_full_images, + halo_next_read_pattern_params.num_rows_bottom_partial_image, + halo_next_read_pattern_params.num_skip_rows_bottom_partial_image, + halo_next_read_pattern_params.bottom_partial_left_aligned_row_width, + halo_next_read_pattern_params.skip_bottom_partial_left_aligned_row, + + halo_prev_input_num_rows_of_tiles + local_input_num_rows_of_tiles + halo_next_input_num_rows_of_tiles, + num_input_tiles_in_row, + num_output_tiles, + + (uint32_t) false}; + + tt::tt_metal::SetRuntimeArgs(program, downsample_writer_kernel_id, core, writer_kernel_args); + prev_core = core; + } + + auto override_runtime_args_callback = [input_cb = input_cb, + final_tilize_output_cb = final_tilize_output_cb, + downsample_writer_kernel_id = downsample_writer_kernel_id, + cores = cores]( + const void* operation, + Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { + auto src_buffer = input_tensors.at(0).buffer(); + auto dst_buffer = output_tensors.at(0).buffer(); + + UpdateDynamicCircularBufferAddress(program, input_cb, *src_buffer); + UpdateDynamicCircularBufferAddress(program, final_tilize_output_cb, *dst_buffer); + + auto& writer_runtime_args_by_core = GetRuntimeArgs(program, downsample_writer_kernel_id); + for (const auto& 
core : cores) {
+ auto& runtime_args = writer_runtime_args_by_core[core.x][core.y];
+ runtime_args[8] = src_buffer->address();
+ runtime_args[39] = src_buffer->address();
+ }
+ };
+
+ return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback};
+}
+
+} // namespace ttnn::operations::data_movement::detail
diff --git a/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.hpp
new file mode 100644
index 00000000000..ef32eb54942
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/data_movement/downsample/device/downsample_program_factory.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+
+#include "tt_metal/host_api.hpp"
+
+using namespace tt::constants;
+
+namespace ttnn::operations::data_movement::detail {
+
+std::pair<uint32_t, uint32_t> get_num_cores_height_width_sliced(
+ CoreRangeSet all_cores, TensorMemoryLayout memory_layout, ShardOrientation shard_orientation);
+operation::ProgramWithCallbacks downsample_single_core(
+ const Tensor& a, std::array<uint32_t, 5> downsample_params, Tensor& output);
+
+} // namespace ttnn::operations::data_movement::detail
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
new file mode 100644
index 00000000000..44a2d903705
--- /dev/null
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -0,0 +1,1127 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "ttnn/tensor/host_buffer/functions.hpp"
+#include "ttnn/deprecated/tt_dnn/op_library/work_split.hpp"
+#include "ttnn/deprecated/tt_dnn/op_library/math.hpp"
+#include "tt_metal/common/constants.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_log.h"
+using namespace tt::constants;
+
+namespace ttnn::operations::data_movement::detail {
+
+
+operation::ProgramWithCallbacks pad_rm_reader_writer(const Tensor &a,
+ Tensor &output,
+ const tt::tt_metal::Shape &output_tensor_shape,
+ const tt::tt_metal::Shape &input_tensor_start,
+ const float pad_value) {
+ Program program{};
+
+ auto output_shape = output_tensor_shape;
+
+ uint32_t unpadded_row_size_nbytes = a.get_legacy_shape()[3] * a.element_size();
+ uint32_t padded_row_size_nbytes = output_shape[3] * a.element_size(); // Assuming output is same datatype as input
+ TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size");
+
+ // construct const buffer with the pad_value
+ Device *device = a.device();
+ uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32
+ uint32_t pad_value_const_buffer_nbytes = pad_value_const_buffer_size * a.element_size();
+ auto pad_value_const_buffer = tt::tt_metal::owned_buffer::create(std::vector<bfloat16>(pad_value_const_buffer_size, bfloat16(pad_value)));
+ const Tensor pad_value_const_tensor =
+ Tensor(
+ OwnedStorage{pad_value_const_buffer},
+ Shape(std::array<uint32_t, 4>{1, 1, 1, pad_value_const_buffer_size}),
+ DataType::BFLOAT16,
+ Layout::ROW_MAJOR)
+ .to(device, MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1});
+ auto pad_value_const_tensor_addr = pad_value_const_tensor.buffer()->address();
+
+ CoreRange cores({0, 0}, {0, 0});
+ uint32_t cb_id = tt::CB::c_in0;
+ 
uint32_t cb_npages = 16; // multibuffering + uint32_t cb_pagesize = tt::round_up(padded_row_size_nbytes, tt::constants::TILE_WIDTH); + tt::DataFormat in_df = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(cb_npages * cb_pagesize, {{cb_id, in_df}}) + .set_page_size(cb_id, cb_pagesize); + auto cb = tt::tt_metal::CreateCircularBuffer(program, cores, cb_config); + + + Buffer *src0_buffer = a.buffer(); + Buffer *dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(unpadded_row_size_nbytes); + uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(unpadded_row_size_nbytes) : 0; + bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(padded_row_size_nbytes); + uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? (std::uint32_t) std::log2(padded_row_size_nbytes) : 0; + std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, + (std::uint32_t) dst_is_dram, + (std::uint32_t) src_stick_size_is_power_of_two, + (std::uint32_t) src_log2_stick_size, + (std::uint32_t) dst_stick_size_is_power_of_two, + (std::uint32_t) dst_log2_stick_size}; + std::vector writer_ct_args = reader_ct_args; + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + bfloat16 bfloat_zero = bfloat16(0.0f); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_zero, bfloat_pad_value}); + + KernelHandle reader_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp", + cores, + tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); + KernelHandle writer_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp", + cores, + tt::tt_metal::WriterDataMovementConfig(writer_ct_args)); + uint32_t padded_row_diff_size_nbytes = padded_row_size_nbytes - unpadded_row_size_nbytes; + + #if 0 + { + log_debug("src0_buffer_addr: {}", src0_buffer->address()); + log_debug("dst_buffer_addr: {}", dst_buffer->address()); + log_debug("a.shape[0]: {}", a.get_legacy_shape()[0]); + log_debug("out.shape[0]: {}", output_shape[0]); + log_debug("a.shape[1]: {}", a.get_legacy_shape()[1]); + log_debug("out.shape[1]: {}", output_shape[1]); + log_debug("a.shape[2]: {}", a.get_legacy_shape()[2]); + log_debug("out.shape[2]: {}", output_shape[2]); + log_debug("s.shape[3]: {}", a.get_legacy_shape()[3]); + log_debug("out.shape[3]: {}", output_shape[3]); + log_debug("unpadded_row_size_nbytes: {}", unpadded_row_size_nbytes); + log_debug("padded_row_size_nbytes: {}", padded_row_size_nbytes); + log_debug("padded_row_diff_size_nbytes: {}", padded_row_diff_size_nbytes); + log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); + log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); + log_debug("packed_pad_value: {}", packed_pad_value); + } + #endif + + uint32_t start_src_stick_id = 0; + uint32_t start_dst_stick_id = 0; + vector reader_rt_args = {src0_buffer->address(), + dst_buffer->address(), + a.get_legacy_shape()[0], + output_shape[0], + a.get_legacy_shape()[1], + output_shape[1], + a.get_legacy_shape()[2], + 
output_shape[2], + a.get_legacy_shape()[3], + output_shape[3], + unpadded_row_size_nbytes, + padded_row_size_nbytes, + padded_row_diff_size_nbytes, + pad_value_const_tensor_addr, + pad_value_const_buffer_nbytes, + packed_pad_value, + start_src_stick_id, + start_dst_stick_id, + 0, + 0, + 0, + output_shape[2], + a.get_legacy_shape()[2], + unpadded_row_size_nbytes, + padded_row_size_nbytes, + 0, + output.get_legacy_shape()[0] + }; + vector writer_rt_args = reader_rt_args; + tt::tt_metal::SetRuntimeArgs(program, + reader_kernel_id, + cores, + reader_rt_args); + tt::tt_metal::SetRuntimeArgs(program, + writer_kernel_id, + cores, + writer_rt_args); + + auto override_runtime_args_callback = + [reader_kernel_id=reader_kernel_id, writer_kernel_id=writer_kernel_id]( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers) { + auto src_buffer = input_buffers.at(0); + auto dst_buffer = output_buffers.at(0); + CoreCoord core = {0, 0}; + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_buffer->address(); + runtime_args[1] = dst_buffer->address(); + } + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = src_buffer->address(); + runtime_args[1] = dst_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, + Tensor &output, + const Shape &output_tensor_shape, + const Shape &input_tensor_start, + const float pad_value) { + Program program{}; + + auto output_shape = output_tensor_shape; + + uint32_t unpadded_row_size_nbytes = a.get_legacy_shape()[3] * a.element_size(); + uint32_t padded_row_size_nbytes = output_shape[3] * a.element_size(); // Assuming output is same datatype as input + TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size"); + + Device *device = a.device(); + auto dst_buffer_l1 = Buffer(device, padded_row_size_nbytes, padded_row_size_nbytes, BufferType::L1); + + // construct const buffer with the pad_value + uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32 + uint32_t pad_value_const_buffer_nbytes = pad_value_const_buffer_size * a.element_size(); + auto pad_value_const_buffer = owned_buffer::create(std::vector(pad_value_const_buffer_size, bfloat16(pad_value))); + const Tensor pad_value_const_tensor = + Tensor( + OwnedStorage{pad_value_const_buffer}, + Shape(std::array{1, 1, 1, pad_value_const_buffer_size}), + DataType::BFLOAT16, + Layout::ROW_MAJOR) + .to(device, MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1}); + auto pad_value_const_tensor_addr = pad_value_const_tensor.buffer()->address(); + + Buffer *src0_buffer = a.buffer(); + Buffer *dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(unpadded_row_size_nbytes); + uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(unpadded_row_size_nbytes) : 0; + bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(padded_row_size_nbytes); + uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? 
(std::uint32_t) std::log2(padded_row_size_nbytes) : 0; + std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, + (std::uint32_t) dst_is_dram, + (std::uint32_t) src_stick_size_is_power_of_two, + (std::uint32_t) src_log2_stick_size, + (std::uint32_t) dst_stick_size_is_power_of_two, + (std::uint32_t) dst_log2_stick_size}; + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + bfloat16 bfloat_zero = bfloat16(0.0f); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_zero, bfloat_pad_value}); + + CoreRange core({0, 0}, {0, 0}); + KernelHandle reader_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/pad_dims_rm_interleaved_opt.cpp", + core, + tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); + uint32_t padded_row_diff_size_nbytes = padded_row_size_nbytes - unpadded_row_size_nbytes; + + #if 0 + { + tt::log_debug("src0_buffer_addr: {}", src0_buffer->address()); + tt::log_debug("dst_buffer_addr: {}", dst_buffer->address()); + tt::log_debug("a.shape[0]: {}", a.get_legacy_shape()[0]); + tt::log_debug("out.shape[0]: {}", output_shape[0]); + tt::log_debug("a.shape[1]: {}", a.get_legacy_shape()[1]); + tt::log_debug("out.shape[1]: {}", output_shape[1]); + tt::log_debug("a.shape[2]: {}", a.get_legacy_shape()[2]); + tt::log_debug("out.shape[2]: {}", output_shape[2]); + tt::log_debug("s.shape[3]: {}", a.get_legacy_shape()[3]); + tt::log_debug("out.shape[3]: {}", output_shape[3]); + tt::log_debug("unpadded_row_size_nbytes: {}", unpadded_row_size_nbytes); + tt::log_debug("padded_row_size_nbytes: {}", padded_row_size_nbytes); + tt::log_debug("padded_row_diff_size_nbytes: {}", padded_row_diff_size_nbytes); + tt::log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); + tt::log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); + tt::log_debug("packed_pad_value: {}", packed_pad_value); + tt::log_debug("dst_buffer_l1_addr: {}", dst_buffer_l1.address()); + } + #endif + + vector reader_rt_args = {src0_buffer->address(), + dst_buffer->address(), + a.get_legacy_shape()[0], + output_shape[0], + a.get_legacy_shape()[1], + output_shape[1], + a.get_legacy_shape()[2], + output_shape[2], + a.get_legacy_shape()[3], + output_shape[3], + unpadded_row_size_nbytes, + padded_row_size_nbytes, + padded_row_diff_size_nbytes, + pad_value_const_tensor_addr, + pad_value_const_buffer_nbytes, + packed_pad_value, + dst_buffer_l1.address()}; + tt::tt_metal::SetRuntimeArgs(program, + reader_kernel_id, + core, + reader_rt_args); + + auto override_runtime_args_callback = [kernel_id=reader_kernel_id](const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers) { + auto src_buffer = input_buffers.at(0); + auto dst_buffer = output_buffers.at(0); + CoreCoord core = {0, 0}; + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, kernel_id, core); + runtime_args[0] = src_buffer->address(); + runtime_args[1] = dst_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks pad_rm(const Tensor &a, Tensor &output, const Shape &output_tensor_shape, const Shape &input_tensor_start, const float pad_value) { + + tt::tt_metal::Program program{}; + + CoreRange core({0, 0}, {0, 0}); + + // This should allocate a DRAM buffer on the device + tt::tt_metal::Device *device = a.device(); + + auto output_shape = output_tensor_shape; + + tt::tt_metal::Buffer *src0_buffer = a.buffer(); + + uint32_t unpadded_row_size_bytes = 
a.get_legacy_shape()[3] * a.element_size(); + uint32_t padded_row_size_bytes = output_shape[3] * a.element_size(); + + tt::tt_metal::Buffer *dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + uint32_t src_stick_size = unpadded_row_size_bytes; + uint32_t dst_stick_size = padded_row_size_bytes; + + uint32_t dst_buffer_size = dst_stick_size; + + tt::tt_metal::InterleavedBufferConfig buff_config{ + .device= device, + .size = dst_buffer_size, + .page_size = dst_buffer_size, + .buffer_type = tt::tt_metal::BufferType::L1 + }; + + auto dst_buffer_l1 = tt::tt_metal::CreateBuffer(buff_config); + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); + + vector reader_kernel_args = { + src0_buffer->address(), + dst_buffer->address(), + a.get_legacy_shape()[0], + output_shape[0], + a.get_legacy_shape()[1], + output_shape[1], + a.get_legacy_shape()[2], + output_shape[2], + a.get_legacy_shape()[3], + output_shape[3], + unpadded_row_size_bytes, + padded_row_size_bytes, + padded_row_size_bytes - unpadded_row_size_bytes, + packed_pad_value, + dst_buffer_l1->address() + }; + bool src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + bool src_stick_size_is_power_of_two = tt::tt_metal::is_power_of_two_at_least_32(src_stick_size); + uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(src_stick_size) : 0; + bool dst_stick_size_is_power_of_two = tt::tt_metal::is_power_of_two_at_least_32(dst_stick_size); + uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? 
(std::uint32_t) std::log2(dst_stick_size) : 0; + std::vector compile_time_args_vec = { + (std::uint32_t) src0_is_dram, + (std::uint32_t) dst_is_dram, + (std::uint32_t) src_stick_size_is_power_of_two, + (std::uint32_t) src_log2_stick_size, + (std::uint32_t) dst_stick_size_is_power_of_two, + (std::uint32_t) dst_log2_stick_size, + + }; + + // Tilized reader + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/pad_dims_rm_interleaved.cpp", + core, + tt::tt_metal::ReaderDataMovementConfig(compile_time_args_vec)); + + tt::tt_metal::SetRuntimeArgs( + program, + unary_reader_kernel_id, + core, + reader_kernel_args + ); + + auto override_runtime_args_callback = [kernel_id=unary_reader_kernel_id]( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_buffer = input_buffers.at(0); + auto dst_buffer = output_buffers.at(0); + + CoreCoord core = {0, 0}; + + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, kernel_id, core); + runtime_args[0] = src_buffer->address(); + runtime_args[1] = dst_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks pad_tile(const Tensor &a, Tensor& output, const tt::tt_metal::Shape &output_tensor_shape, const tt::tt_metal::Shape &input_tensor_start, const float pad_value) { + + tt::tt_metal::Program program{}; + + CoreRange core({0, 0}, {0, 0}); + + // This should allocate a DRAM buffer on the device + tt::tt_metal::Device *device = a.device(); + + auto output_shape = output_tensor_shape; + + tt::tt_metal::Buffer *src0_buffer = a.buffer(); + + tt::tt_metal::Buffer *dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t single_tile_size = tt::tt_metal::detail::TileSize(cb_data_format); + + tt::log_debug("pad_tile"); + tt::log_debug("cb_data_format: {}", cb_data_format); + tt::log_debug("single_tile_size: {}", single_tile_size); + tt::log_debug("output_tensor_shape: {}", output_tensor_shape); + tt::log_debug("input_tensor_start: {}", input_tensor_start); + tt::log_debug("pad_value: {}", pad_value); + + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = 2; + tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src0_config); + + uint32_t src1_cb_index = 1; // For pad buffer + uint32_t num_pad_tiles = 1; + tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(num_pad_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src1_config); + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); + + uint32_t num_unpadded_Xt = a.get_legacy_shape()[3] / TILE_WIDTH; + uint32_t num_total_Xt = output_shape[3] / TILE_WIDTH; + uint32_t num_padded_Xt = num_total_Xt - num_unpadded_Xt; + uint32_t num_unpadded_Yt = a.get_legacy_shape()[2] / TILE_HEIGHT; + uint32_t num_total_Yt = output_shape[2] 
/ TILE_HEIGHT; + uint32_t num_padded_Yt = (num_total_Yt - num_unpadded_Yt) * num_total_Xt; + uint32_t num_unpadded_Z = a.get_legacy_shape()[1]; + uint32_t num_total_Z = output_shape[1]; + uint32_t num_padded_Zt = (num_total_Z - num_unpadded_Z) * num_total_Yt * num_total_Xt; + uint32_t num_unpadded_W = a.get_legacy_shape()[0]; + uint32_t num_total_W = output_shape[0]; + uint32_t num_padded_Wt = (num_total_W - num_unpadded_W) * num_total_Z * num_total_Yt * num_total_Xt; + + uint32_t num_unpadded_tiles = a.volume() / TILE_HW; + + vector reader_kernel_args = { + src0_buffer->address(), + num_unpadded_tiles, 0 + }; + vector writer_kernel_args = { + dst_buffer->address(), + num_unpadded_W, + num_padded_Wt, + num_unpadded_Z, + num_padded_Zt, + num_unpadded_Yt, + num_padded_Yt, + num_unpadded_Xt, + num_padded_Xt, + packed_pad_value, + }; + + // Reader compile-time args + // Data is 32 byte aligned + bool src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + std::vector reader_compile_time_args = { + // interleaved accessor args + (std::uint32_t) src0_is_dram + }; + std::vector writer_compile_time_args = { + // interleaved accessor args + (std::uint32_t) src0_cb_index, + (std::uint32_t) src1_cb_index, + (std::uint32_t) dst_is_dram + }; + // Tilized reader + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp", + core, + tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); + + tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_unary_pad_dims_interleaved.cpp", + core, + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); + + tt::tt_metal::SetRuntimeArgs( + program, + unary_reader_kernel_id, + core, + reader_kernel_args + ); + + tt::tt_metal::SetRuntimeArgs( + program, + unary_writer_kernel_id, + core, + writer_kernel_args + ); + + auto override_runtime_args_callback = [unary_reader_kernel_id, unary_writer_kernel_id]( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_dram_buffer = input_buffers.at(0); + + auto dst_dram_buffer = output_buffers.at(0); + + CoreCoord core = {0, 0}; + + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, unary_reader_kernel_id, core); + runtime_args[0] = src_dram_buffer->address(); + } + + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, unary_writer_kernel_id, core); + runtime_args[0] = dst_dram_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + + +inline void log_rt_args(const CoreCoord& core, vector& args) { + for (auto v : args) { + tt::log_debug(tt::LogOp, "{},{} :: {}", core.x, core.y, v); + } +} + +// This is currently mostly hardcoded for resnet shapes +inline std::tuple + split_across_cores(CoreCoord grid_size, uint32_t nbatch, uint32_t nchannel, uint32_t ntiles_h, uint32_t ntiles_w) { + + uint32_t ncores, ncores_h, ncores_w, ntiles_per_core_h, ntiles_per_core_w, nbatch_per_core_h, ncores_per_batch_h; + + ncores_h = 1; + + // each batch needs to be padded independently + switch (nbatch) { + case 1: + ncores_h = 1; + nbatch_per_core_h = 1; + ntiles_per_core_h = 1; + switch (ntiles_h) { + case 2: ncores_h = 2; 
ntiles_per_core_h = 1; break; + case 4: ncores_h = 4; ntiles_per_core_h = 1; break; + case 8: ncores_h = 8; ntiles_per_core_h = 1; break; + case 64: ncores_h = 8; ntiles_per_core_h = 8; break; + } + ncores_per_batch_h = ncores_h; + break; + + case 2: + ncores_h = 1; + ncores_per_batch_h = 1; + nbatch_per_core_h = 1; + ntiles_per_core_h = 1; + switch (ntiles_h) { + case 2: ncores_per_batch_h = 2; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 1; break; + case 4: ncores_per_batch_h = 4; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 1; break; + case 8: ncores_per_batch_h = 4; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 2; break; + case 64: ncores_per_batch_h = 4; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 16; break; + } + break; + + case 8: + ncores_h = 8; + ncores_per_batch_h = 1; + nbatch_per_core_h = 1; + ntiles_per_core_h = ntiles_h; + break; + + default: + TT_ASSERT(false, "unhandled nbatch. TODO"); + + // generic case -- TODO + + // one of the following will be 0 when grid_size.y != nbatch + uint32_t nbatch_per_core_h = nbatch / grid_size.y; // floor + uint32_t ncores_per_batch_h = grid_size.y / nbatch; // floor + if (nbatch == grid_size.y) { + nbatch_per_core_h = 1; + ncores_per_batch_h = 1; + } + + // currently uses hardcoded values for resnet50 + // TT_ASSERT(ntiles_h == 1 || ntiles_h == 2 || ntiles_h == 4 || ntiles_h == 16, "Only Resnet50 shapes are supported in multicore version for now."); + // TT_ASSERT(ntiles_w == 64, "Only Resnet50 shapes are supported in multicore version for now."); + + TT_ASSERT(nbatch <= grid_size.y, "Unsupported case with nbatch > grid_size.y!"); + + uint32_t ncores_h = 1; + uint32_t ntiles_per_core_h = ntiles_h / ncores_h; + if (nbatch_per_core_h == 0) { + // there are multiple cores along h per batch + nbatch_per_core_h = 1; + ncores_h = ncores_per_batch_h * nbatch; + ntiles_per_core_h = ntiles_h / ncores_per_batch_h; + } else if (ncores_per_batch_h == 0) { + // unsupported case. TODO. 
+ TT_ASSERT(false);
+ // there are multiple batch per core along h
+ // ncores_per_batch_h = 1;
+ // ncores_h = (uint32_t) ceil((float) nbatch / nbatch_per_core_h);
+ // ntiles_per_core_h = nbatch_per_core_h * ntiles_h;
+ } else {
+ TT_ASSERT(false, "Something went terribly wrong in splitting across cores");
+ }
+ break;
+ }
+
+ ncores_w = 1;
+ switch (ntiles_w) {
+ case 2: ncores_w = 2; break;
+ case 4: ncores_w = 4; break;
+ case 8: ncores_w = 8; break;
+ case 64: ncores_w = 8; break;
+ }
+ ncores = ncores_h * ncores_w;
+ ntiles_per_core_w = ntiles_w / ncores_w;
+ std::set<CoreRange> all_cores;
+ std::set<CoreRange> core_range;
+
+ all_cores.insert(CoreRange(CoreCoord(0, 0), CoreCoord(ncores_w - 1, ncores_h - 1)));
+ core_range.insert(CoreRange(CoreCoord(0, 0), CoreCoord(ncores_w - 1, ncores_h - 1)));
+
+ return std::make_tuple(ncores, ncores_h, ncores_w, all_cores, core_range, ntiles_per_core_h, ntiles_per_core_w, nbatch_per_core_h, ncores_per_batch_h);
+}
+
+operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core(const Tensor &a,
+ Tensor &output,
+ const tt::tt_metal::Shape &output_tensor_shape,
+ const tt::tt_metal::Shape &input_tensor_start,
+ const float pad_value) {
+ Program program{};
+
+ auto output_shape = output_tensor_shape;
+
+ uint32_t unpadded_row_size_nbytes = a.get_legacy_shape()[3] * a.element_size();
+ uint32_t padded_row_size_nbytes = output_shape[3] * a.element_size(); // Assuming output is same datatype as input
+ TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size");
+
+ Device *device = a.device();
+
+ // construct const buffer with the pad_value
+ uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32
+ uint32_t pad_value_const_buffer_nbytes = pad_value_const_buffer_size * a.element_size();
+ auto pad_value_const_buffer = owned_buffer::create(std::vector<bfloat16>(pad_value_const_buffer_size, bfloat16(pad_value)));
+ // NOTE: The const buffer is always in L1
+ // TODO: make a local buffer for each core?
+ const Tensor pad_value_const_tensor =
+ Tensor(
+ OwnedStorage{pad_value_const_buffer},
+ Shape(std::array<uint32_t, 4>{1, 1, 1, pad_value_const_buffer_size}),
+ DataType::BFLOAT16,
+ Layout::ROW_MAJOR)
+ .to(device, MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1});
+ auto pad_value_const_tensor_addr = pad_value_const_tensor.buffer()->address();
+
+ // uint32_t ntiles_h = output_tensor_shape[0] * output_tensor_shape[1] * output_tensor_shape[2] / TILE_HEIGHT;
+ uint32_t ntiles_h = output_tensor_shape[2] / TILE_HEIGHT;
+ uint32_t ntiles_w = output_tensor_shape[3] / TILE_WIDTH;
+
+ auto grid_size = device->compute_with_storage_grid_size();
+ uint32_t nbatch = output_tensor_shape[0];
+ uint32_t nchannel = output_tensor_shape[1];
+ // first the batch dim is distributed along H, and within each batch then the tiles are distributed. 
+ auto [ncores, ncores_h, ncores_w, all_cores, core_range, ntiles_per_core_h, ntiles_per_core_w, nbatch_per_core_h, ncores_per_batch_h] = split_across_cores(grid_size, nbatch, nchannel, ntiles_h, ntiles_w); + + int32_t src_nbytes_per_core_w = ntiles_per_core_w * TILE_WIDTH * a.element_size(); + int32_t dst_nbytes_per_core_w = ntiles_per_core_w * TILE_WIDTH * output.element_size(); + + uint32_t cb_id = tt::CB::c_in0; + uint32_t cb_npages = 16; // multibuffering for perf + // uint32_t cb_npages = 1; // multibuffering for perf + uint32_t cb_pagesize = (uint32_t) ceil((float) dst_nbytes_per_core_w / tt::constants::TILE_WIDTH) * tt::constants::TILE_WIDTH; + tt::DataFormat in_df = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(cb_npages * cb_pagesize, {{cb_id, in_df}}) + .set_page_size(cb_id, cb_pagesize); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_config); + + Buffer *src0_buffer = a.buffer(); + Buffer *dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(unpadded_row_size_nbytes); + uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(unpadded_row_size_nbytes) : 0; + bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(padded_row_size_nbytes); + uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? (std::uint32_t) std::log2(padded_row_size_nbytes) : 0; + std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, + (std::uint32_t) dst_is_dram, + (std::uint32_t) src_stick_size_is_power_of_two, + (std::uint32_t) src_log2_stick_size, + (std::uint32_t) dst_stick_size_is_power_of_two, + (std::uint32_t) dst_log2_stick_size}; + std::vector writer_ct_args = reader_ct_args; + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + bfloat16 bfloat_zero = bfloat16(0.0f); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_zero, bfloat_pad_value}); + + KernelHandle reader_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp", + all_cores, + tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); + KernelHandle writer_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp", + all_cores, + tt::tt_metal::WriterDataMovementConfig(writer_ct_args)); + // int32_t padded_row_diff_size_nbytes = padded_row_size_nbytes - unpadded_row_size_nbytes; + log_rt_args(CoreCoord{0, 0}, reader_ct_args); + + #if 1 + { + tt::log_debug("ncores: {}", ncores); + tt::log_debug("ncores_h: {}", ncores_h); + tt::log_debug("ncores_w: {}", ncores_w); + tt::log_debug("ntiles_per_core_h: {}", ntiles_per_core_h); + tt::log_debug("ntiles_per_core_w: {}", ntiles_per_core_w); + tt::log_debug("src0_buffer_addr: {}", src0_buffer->address()); + tt::log_debug("dst_buffer_addr: {}", dst_buffer->address()); + tt::log_debug("a.shape[0]: {}", a.get_legacy_shape()[0]); + tt::log_debug("out.shape[0]: {}", output_shape[0]); + tt::log_debug("a.shape[1]: {}", a.get_legacy_shape()[1]); + tt::log_debug("out.shape[1]: {}", output_shape[1]); + tt::log_debug("a.shape[2]: {}", a.get_legacy_shape()[2]); + 
tt::log_debug("out.shape[2]: {}", output_shape[2]); + tt::log_debug("s.shape[3]: {}", a.get_legacy_shape()[3]); + tt::log_debug("out.shape[3]: {}", output_shape[3]); + tt::log_debug("unpadded_row_size_nbytes: {}", unpadded_row_size_nbytes); + tt::log_debug("padded_row_size_nbytes: {}", padded_row_size_nbytes); + // tt::log_debug("padded_row_diff_size_nbytes: {}", padded_row_diff_size_nbytes); + tt::log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); + tt::log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); + tt::log_debug("packed_pad_value: {}", packed_pad_value); + tt::log_debug("src_nbytes_per_core_w: {}", src_nbytes_per_core_w); + tt::log_debug("dst_nbytes_per_core_w: {}", dst_nbytes_per_core_w); + tt::log_debug("nbatch_per_core_h: {}", nbatch_per_core_h); + tt::log_debug("ncores_per_batch_h: {}", ncores_per_batch_h); + } + #endif + + uint32_t start_src_stick_id = 0; + uint32_t start_dst_stick_id = 0; + uint32_t start_src_stick_wi = 0; // start of stick segment for 2d decomp + uint32_t start_dst_stick_wi = 0; + int32_t local_nsticks = ntiles_per_core_h * TILE_HEIGHT; + int32_t rem_nbatch = nbatch; // per core h, there are ncores_per_batch_h cores, ie each batch ncores_h = ncores_per_batch_h + for (int32_t b = 0; b < nbatch; ++ b) { + int32_t rem_src_nsticks = a.get_legacy_shape()[2]; + for (uint32_t j = 0; j < ncores_per_batch_h; ++ j) { + uint32_t num_local_unpadded_nsticks = local_nsticks; + if (rem_src_nsticks - local_nsticks >= 0) { + // not reached padding sticks yet + rem_src_nsticks -= local_nsticks; + } else { + num_local_unpadded_nsticks = rem_src_nsticks; + rem_src_nsticks = 0; + } + start_src_stick_wi = 0; + start_dst_stick_wi = 0; + int32_t rem_src_stick_size_nbytes = unpadded_row_size_nbytes; + for (uint32_t i = 0; i < ncores_w; ++ i) { + CoreCoord core = {i, b * ncores_per_batch_h + j}; + uint32_t curr_stick_size_nbytes = 0; + int32_t curr_stick_diff_nbytes = 0; + if (rem_src_stick_size_nbytes - dst_nbytes_per_core_w >= 0) { + // no padding on this core + curr_stick_size_nbytes = dst_nbytes_per_core_w; + rem_src_stick_size_nbytes -= dst_nbytes_per_core_w; + } else { + // this core has padding + curr_stick_size_nbytes = rem_src_stick_size_nbytes; + curr_stick_diff_nbytes = dst_nbytes_per_core_w - curr_stick_size_nbytes; + rem_src_stick_size_nbytes = 0; + } + vector reader_rt_args = {src0_buffer->address(), + dst_buffer->address(), + a.get_legacy_shape()[0], + output_shape[0], + a.get_legacy_shape()[1], + output_shape[1], + a.get_legacy_shape()[2], + output_shape[2], + a.get_legacy_shape()[3], + output_shape[3], + curr_stick_size_nbytes, + (uint32_t) dst_nbytes_per_core_w, + (uint32_t) curr_stick_diff_nbytes, + pad_value_const_tensor_addr, + pad_value_const_buffer_nbytes, + packed_pad_value, + start_src_stick_id, + start_dst_stick_id, + start_src_stick_wi, + start_dst_stick_wi, + start_src_stick_wi * a.element_size(), + (uint32_t) local_nsticks, + num_local_unpadded_nsticks, + unpadded_row_size_nbytes, + padded_row_size_nbytes, + start_dst_stick_wi * output.element_size(), + nbatch_per_core_h + }; + // if (core.x == 0) log_rt_args(core, reader_rt_args); + // if (core.x == 0) { + // log_debug("{} :: start_src_stick_id: {}", core.y, start_src_stick_id); + // log_debug("{} :: start_dst_stick_id: {}", core.y, start_dst_stick_id); + // log_debug("{} :: local_nsticks: {}", core.y, local_nsticks); + // log_debug("{} :: num_local_unpadded_nsticks: {}", core.y, num_local_unpadded_nsticks); + // log_debug("{} :: nbatch_per_core_h: {}", 
core.y, nbatch_per_core_h); + // log_debug("{} :: ncores_per_batch_h: {}", core.y, ncores_per_batch_h); + // } + vector writer_rt_args = reader_rt_args; + tt::tt_metal::SetRuntimeArgs(program, + reader_kernel_id, + core, + reader_rt_args); + tt::tt_metal::SetRuntimeArgs(program, + writer_kernel_id, + core, + writer_rt_args); + start_src_stick_wi += ntiles_per_core_w * TILE_WIDTH; + start_dst_stick_wi += ntiles_per_core_w * TILE_WIDTH; + } // for ncores_w + start_src_stick_id += num_local_unpadded_nsticks; + start_dst_stick_id += local_nsticks; + } // for ncores_h + } + + auto override_runtime_args_callback = [reader_kernel_id = reader_kernel_id, + writer_kernel_id = writer_kernel_id, + ncores_h = ncores_h, + ncores_w = ncores_w]( + const Program &program, + const std::vector &input_buffers, + const std::vector &output_buffers) { + auto src_buffer = input_buffers.at(0); + auto dst_buffer = output_buffers.at(0); + + for (uint32_t j = 0; j < ncores_h; ++ j) { + for (uint32_t i = 0; i < ncores_w; ++ i) { + CoreCoord core = {i, j}; + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_buffer->address(); + runtime_args[1] = dst_buffer->address(); + } + { + auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = src_buffer->address(); + runtime_args[1] = dst_buffer->address(); + } + } + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + + +std::vector, std::vector > > get_runtime_args_rm(const Tensor &input_tensor, + Tensor &output_tensor, + uint32_t num_cores_total, + uint32_t num_cores, + uint32_t num_cores_y, + CoreRangeSet core_group_1, + uint32_t num_w_sticks_per_core_group_1, + CoreRangeSet core_group_2, + uint32_t num_w_sticks_per_core_group_2 + ){ + + auto input_buffer = input_tensor.buffer(); + auto output_buffer = output_tensor.buffer(); + auto input_shape = input_tensor.get_legacy_shape(); + auto output_shape = output_tensor.get_legacy_shape(); + + uint32_t W = input_shape[3], H = input_shape[2], C = input_shape[1], N = input_shape[0]; + uint32_t W_bytes = W * input_tensor.element_size(); + + uint32_t W_padded = output_shape[3], H_padded = output_shape[2], C_padded = output_shape[1], N_padded = output_shape[0]; + uint32_t W_padded_bytes = W_padded * input_tensor.element_size(); + + std::uint32_t num_dims = static_cast(input_shape.rank()); + std::vector start_dim_offset(num_dims, 0); + + + std::vector, std::vector > > ret_val(num_cores_total); + + + uint32_t max_read_size = 2048; + uint32_t curr_c = 0, curr_h = 0, curr_n = 0; + for(uint32_t i = 0, curr_sticks_read = 0, curr_sticks_write = 0; i < num_cores_total; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + + uint32_t num_sticks_per_core; + if (core_group_1.core_coord_in_core_ranges(core)) { + num_sticks_per_core = num_w_sticks_per_core_group_1; + } else if (core_group_2.core_coord_in_core_ranges(core)) { + num_sticks_per_core = num_w_sticks_per_core_group_2; + } else { + //no-op + num_sticks_per_core = 0; + } + + + // issue more reads before calling barrier + uint32_t num_sticks_per_core_read = 0, num_read_per_barrier = 0; + if (num_sticks_per_core != 0) { + num_sticks_per_core_read = merge_num_sticks_to_read(num_sticks_per_core, W_bytes, max_read_size); + num_read_per_barrier = num_sticks_per_core / num_sticks_per_core_read; + } + + // reader + std::vector reader_runtime_args = { + input_buffer->address(), + num_sticks_per_core_read, + num_read_per_barrier, + curr_sticks_read, + }; + 
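get_runtime_args_rm (here and in the slice factory later in this patch) iterates cores with CoreCoord core = {i / num_cores_y, i % num_cores_y}, i.e. the linear index walks down one column of the compute grid before moving to the next column. A tiny plain-int sketch of that mapping:

    #include <cstdint>
    #include <utility>

    // Linear core index -> (x, y): y varies fastest, so indices 0..num_cores_y-1 are
    // column x = 0, the next num_cores_y indices are column x = 1, and so on.
    static std::pair<uint32_t, uint32_t> core_xy_from_linear(uint32_t i, uint32_t num_cores_y) {
        return {i / num_cores_y, i % num_cores_y};
    }
    // e.g. with an 8-row grid (num_cores_y == 8): i = 3 -> {0, 3}, i = 8 -> {1, 0}.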
reader_runtime_args.insert(reader_runtime_args.end(), start_dim_offset.begin(), start_dim_offset.end()); + + // writer + std::vector writer_runtime_args = { + output_buffer->address(), + num_sticks_per_core_read, + num_read_per_barrier, + curr_sticks_write + }; + + ret_val[i] = {reader_runtime_args, writer_runtime_args}; + + curr_sticks_write += num_sticks_per_core; + + for (uint32_t i = 0; i < num_sticks_per_core; ++i) { + + if (curr_h < H and curr_c < C and curr_n < N) { + curr_sticks_read++; + } + + curr_h++; + if (curr_h == H_padded) { + curr_c++; + curr_h = 0; + if (curr_c == C_padded) { + curr_n++; + curr_c = 0; + } + } + } + + start_dim_offset = {0, curr_h, curr_c, curr_n}; + + } + + return ret_val; +} + +operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(const Tensor &a, + Tensor &output, + const tt::tt_metal::Shape &output_tensor_shape, + const tt::tt_metal::Shape &input_tensor_start, + const float pad_value) { + Program program{}; + + auto output_shape = output_tensor_shape; + uint32_t W = a.shape()[3], H = a.shape()[2], C = a.shape()[1], N = a.shape()[0]; + uint32_t NCH = H * C * N; + uint32_t W_padded = output_tensor_shape[3], H_padded = output_tensor_shape[2], C_padded = output_tensor_shape[1], N_padded = output_tensor_shape[0]; + uint32_t NCH_padded = H_padded * C_padded * N_padded; + + auto stick_size = W * a.element_size(); + auto stick_size_padded = W_padded * a.element_size(); + auto rem_stick_size_padded = stick_size_padded - stick_size; + uint32_t row_major_min_bytes = 16; + + tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + + Device *device = a.device(); + + auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + uint32_t num_cores_total = num_cores_x * num_cores_y; + CoreRange total_cores({0, 0}, {num_cores_x-1, num_cores_y-1}); + + auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_padded_per_core_group_1, num_sticks_padded_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, NCH_padded); + + uint32_t src0_cb_index = 0; + auto num_sticks = num_sticks_padded_per_core_group_1 > num_sticks_padded_per_core_group_2 ? num_sticks_padded_per_core_group_1 : num_sticks_padded_per_core_group_2; + + tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, stick_size_padded); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); + + // construct const buffer with the pad_value + bool not_pad_by_zero = pad_value != 0; + if (not_pad_by_zero) { + uint32_t src1_cb_index = 1; + tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(row_major_min_bytes, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, row_major_min_bytes); + auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config); + } + + Buffer *src0_buffer = a.buffer(); + Buffer *dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); + + bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 
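In the loop above, the writer emits every padded stick while the reader only advances over sticks that exist in the input, tracked by walking (h, c, n) through the padded index space. The standalone sketch below mirrors that walk so the bookkeeping is easier to follow; it is equivalent to, not identical with, the code in get_runtime_args_rm.

    #include <cstdint>

    // Advance the (curr_h, curr_c, curr_n) walk by num_output_sticks padded sticks and
    // return how many of them fall inside the unpadded input (and so are actually read).
    // The walk state is carried from one core to the next, exactly as in the factory.
    static uint32_t count_input_sticks(
        uint32_t num_output_sticks,
        uint32_t H, uint32_t C, uint32_t N,        // unpadded extents
        uint32_t H_padded, uint32_t C_padded,      // padded extents
        uint32_t& curr_h, uint32_t& curr_c, uint32_t& curr_n) {
        uint32_t sticks_read = 0;
        for (uint32_t i = 0; i < num_output_sticks; ++i) {
            if (curr_h < H && curr_c < C && curr_n < N) {
                ++sticks_read;                     // still inside the input region
            }
            if (++curr_h == H_padded) {
                curr_h = 0;
                if (++curr_c == C_padded) {
                    curr_c = 0;
                    ++curr_n;
                }
            }
        }
        return sticks_read;
    }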
1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); + uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(stick_size) : 0; + bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size_padded); + uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? (std::uint32_t) std::log2(stick_size_padded) : 0; + std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, + (std::uint32_t) N, + (std::uint32_t) H, + (std::uint32_t) C, + (std::uint32_t) stick_size, + (std::uint32_t) N_padded, + (std::uint32_t) H_padded, + (std::uint32_t) C_padded, + (std::uint32_t) stick_size_padded, + (std::uint32_t) (stick_size_padded - stick_size), + (std::uint32_t) not_pad_by_zero, + (std::uint32_t) packed_pad_value, + (std::uint32_t) row_major_min_bytes, + (std::uint32_t) (rem_stick_size_padded / row_major_min_bytes), + (std::uint32_t) (stick_size_padded / row_major_min_bytes), + (std::uint32_t) src_stick_size_is_power_of_two, + (std::uint32_t) src_stick_size_is_power_of_two ? src_log2_stick_size : stick_size}; + std::vector writer_ct_args = {(std::uint32_t) src0_cb_index, + (std::uint32_t) dst_is_dram, + (std::uint32_t) stick_size_padded, + (std::uint32_t) dst_stick_size_is_power_of_two, + (std::uint32_t) dst_stick_size_is_power_of_two ? dst_log2_stick_size : stick_size_padded}; + + KernelHandle reader_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp", + total_cores, + tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); + KernelHandle writer_kernel_id = CreateKernel(program, + "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp", + total_cores, + tt::tt_metal::WriterDataMovementConfig(writer_ct_args)); + + auto all_runtime_args = get_runtime_args_rm(a, output, num_cores_total, num_cores, num_cores_y, core_group_1, num_sticks_padded_per_core_group_1, core_group_2, num_sticks_padded_per_core_group_2); + + for(uint32_t i = 0; i < num_cores_total; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + tt::tt_metal::SetRuntimeArgs( + program, + reader_kernel_id, + core, + all_runtime_args[i].first + ); + + tt::tt_metal::SetRuntimeArgs( + program, + writer_kernel_id, + core, + all_runtime_args[i].second + + ); + } + + auto override_runtime_args_callback = [ + reader_kernel_id, + writer_kernel_id, + compute_with_storage_grid_size + ] + ( + const void* operation, + const Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors + ) { + auto src_tensor = input_tensors.at(0); + + auto dst_tensor = output_tensors.at(0); + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + + uint32_t num_cores_total = num_cores_x * num_cores_y; + + auto output_tensor_shape = dst_tensor.shape(); + uint32_t W_padded = output_tensor_shape[3], H_padded = output_tensor_shape[2], C_padded = output_tensor_shape[1], N_padded = output_tensor_shape[0]; + uint32_t NCH_padded = H_padded * C_padded * N_padded; + + auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_padded_per_core_group_1, num_sticks_padded_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, NCH_padded); + auto all_runtime_args = get_runtime_args_rm(src_tensor, dst_tensor, num_cores_total, num_cores, 
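The pad value reaches the kernels as two bfloat16 copies packed into a single uint32 (pack_two_bfloat16_into_uint32). The sketch below shows what such packing amounts to, converting the float by truncating it to its upper 16 bits; the low/high ordering and the use of truncation rather than round-to-nearest are simplifications of mine and are not guaranteed to match the library helper.

    #include <cstdint>
    #include <cstring>

    // bfloat16 is the upper 16 bits of an IEEE-754 float; truncation suffices here.
    static uint16_t to_bfloat16_bits(float v) {
        uint32_t bits;
        std::memcpy(&bits, &v, sizeof(bits));
        return static_cast<uint16_t>(bits >> 16);
    }

    // Pack two bfloat16 values into one 32-bit word (first value in the low half).
    static uint32_t pack_two_bfloat16(float lo, float hi) {
        return static_cast<uint32_t>(to_bfloat16_bits(lo)) |
               (static_cast<uint32_t>(to_bfloat16_bits(hi)) << 16);
    }
    // e.g. pack_two_bfloat16(0.0f, 1.0f) == 0x3F800000.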
num_cores_y, core_group_1, num_sticks_padded_per_core_group_1, core_group_2, num_sticks_padded_per_core_group_2); + + for(uint32_t i = 0; i < num_cores_total; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + { + SetRuntimeArgs(program, reader_kernel_id, core, all_runtime_args[i].first); + } + + { + SetRuntimeArgs(program, writer_kernel_id, core, all_runtime_args[i].second); + } + } + }; + + return {.program=std::move(program), .override_runtime_arguments_callback=override_runtime_args_callback}; +} + + + +} // namespace ttnn::operations::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.hpp index 5320e1cb98a..f28241392fb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.hpp @@ -2,14 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/tensor/host_buffer/functions.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/work_split.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" -#include "tt_metal/common/constants.hpp" -#include "tt_metal/detail/util.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_log.h" namespace ttnn::operations::data_movement::detail { @@ -18,1109 +11,33 @@ operation::ProgramWithCallbacks pad_rm_reader_writer(const Tensor &a, Tensor &output, const tt::tt_metal::Shape &output_tensor_shape, const tt::tt_metal::Shape &input_tensor_start, - const float pad_value) { - Program program{}; - - auto output_shape = output_tensor_shape; - - uint32_t unpadded_row_size_nbytes = a.get_legacy_shape()[3] * a.element_size(); - uint32_t padded_row_size_nbytes = output_shape[3] * a.element_size(); // Assuming output is same datatype as input - TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size"); - - // construct const buffer with the pad_value - Device *device = a.device(); - uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32 - uint32_t pad_value_const_buffer_nbytes = pad_value_const_buffer_size * a.element_size(); - auto pad_value_const_buffer = tt::tt_metal::owned_buffer::create(std::vector(pad_value_const_buffer_size, bfloat16(pad_value))); - const Tensor pad_value_const_tensor = - Tensor( - OwnedStorage{pad_value_const_buffer}, - Shape(std::array{1, 1, 1, pad_value_const_buffer_size}), - DataType::BFLOAT16, - Layout::ROW_MAJOR) - .to(device, MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1}); - auto pad_value_const_tensor_addr = pad_value_const_tensor.buffer()->address(); - - CoreRange cores({0, 0}, {0, 0}); - uint32_t cb_id = tt::CB::c_in0; - uint32_t cb_npages = 16; // multibuffering - uint32_t cb_pagesize = tt::round_up(padded_row_size_nbytes, tt::constants::TILE_WIDTH); - tt::DataFormat in_df = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(cb_npages * cb_pagesize, {{cb_id, in_df}}) - .set_page_size(cb_id, cb_pagesize); - auto cb = tt::tt_metal::CreateCircularBuffer(program, cores, cb_config); - - - Buffer *src0_buffer = a.buffer(); - Buffer *dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 
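Two things are worth noting at this boundary. First, the new pad_program_factory.cpp above closes with "} // namespace ttnn::operations::reduction::detail" even though the functions it defines live in ttnn::operations::data_movement::detail, as the header below declares them; the mismatch is only in the closing comment, so it does not affect the build, but it appears to be a copy-paste leftover. Second, after this hunk the header is reduced to declarations only; abridged, it takes the shape below (the full signatures are exactly the ones kept in the hunks that follow).

    // pad_program_factory.hpp after the split (abridged): includes and declarations
    // only; all program construction now lives in the new pad_program_factory.cpp.
    #include "tt_metal/host_api.hpp"

    namespace ttnn::operations::data_movement::detail {

    operation::ProgramWithCallbacks pad_rm_reader_writer(
        const Tensor& a,
        Tensor& output,
        const tt::tt_metal::Shape& output_tensor_shape,
        const tt::tt_metal::Shape& input_tensor_start,
        const float pad_value);

    // ... pad_rm_opt, pad_rm, pad_tile, pad_rm_reader_writer_multi_core,
    //     pad_rm_reader_writer_multi_core_v2 declared the same way ...

    }  // namespace ttnn::operations::data_movement::detail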
1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(unpadded_row_size_nbytes); - uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(unpadded_row_size_nbytes) : 0; - bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(padded_row_size_nbytes); - uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? (std::uint32_t) std::log2(padded_row_size_nbytes) : 0; - std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, - (std::uint32_t) dst_is_dram, - (std::uint32_t) src_stick_size_is_power_of_two, - (std::uint32_t) src_log2_stick_size, - (std::uint32_t) dst_stick_size_is_power_of_two, - (std::uint32_t) dst_log2_stick_size}; - std::vector writer_ct_args = reader_ct_args; - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - bfloat16 bfloat_zero = bfloat16(0.0f); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_zero, bfloat_pad_value}); - - KernelHandle reader_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp", - cores, - tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); - KernelHandle writer_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp", - cores, - tt::tt_metal::WriterDataMovementConfig(writer_ct_args)); - uint32_t padded_row_diff_size_nbytes = padded_row_size_nbytes - unpadded_row_size_nbytes; + const float pad_value); - #if 0 - { - log_debug("src0_buffer_addr: {}", src0_buffer->address()); - log_debug("dst_buffer_addr: {}", dst_buffer->address()); - log_debug("a.shape[0]: {}", a.get_legacy_shape()[0]); - log_debug("out.shape[0]: {}", output_shape[0]); - log_debug("a.shape[1]: {}", a.get_legacy_shape()[1]); - log_debug("out.shape[1]: {}", output_shape[1]); - log_debug("a.shape[2]: {}", a.get_legacy_shape()[2]); - log_debug("out.shape[2]: {}", output_shape[2]); - log_debug("s.shape[3]: {}", a.get_legacy_shape()[3]); - log_debug("out.shape[3]: {}", output_shape[3]); - log_debug("unpadded_row_size_nbytes: {}", unpadded_row_size_nbytes); - log_debug("padded_row_size_nbytes: {}", padded_row_size_nbytes); - log_debug("padded_row_diff_size_nbytes: {}", padded_row_diff_size_nbytes); - log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); - log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); - log_debug("packed_pad_value: {}", packed_pad_value); - } - #endif - - uint32_t start_src_stick_id = 0; - uint32_t start_dst_stick_id = 0; - vector reader_rt_args = {src0_buffer->address(), - dst_buffer->address(), - a.get_legacy_shape()[0], - output_shape[0], - a.get_legacy_shape()[1], - output_shape[1], - a.get_legacy_shape()[2], - output_shape[2], - a.get_legacy_shape()[3], - output_shape[3], - unpadded_row_size_nbytes, - padded_row_size_nbytes, - padded_row_diff_size_nbytes, - pad_value_const_tensor_addr, - pad_value_const_buffer_nbytes, - packed_pad_value, - start_src_stick_id, - start_dst_stick_id, - 0, - 0, - 0, - output_shape[2], - a.get_legacy_shape()[2], - unpadded_row_size_nbytes, - padded_row_size_nbytes, - 0, - output.get_legacy_shape()[0] - }; - vector writer_rt_args = reader_rt_args; - tt::tt_metal::SetRuntimeArgs(program, - reader_kernel_id, - cores, - reader_rt_args); - tt::tt_metal::SetRuntimeArgs(program, - writer_kernel_id, - cores, - writer_rt_args); - - auto 
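Both this single-core path and the multi-core factory earlier size their circular-buffer pages by rounding the padded row byte count up to a multiple of TILE_WIDTH (32): the multi-core version spells it out as ceil-and-multiply, and this one presumably gets the same result from tt::round_up. A one-line sketch of that rounding:

    #include <cstdint>

    // Round value up to the next multiple of `multiple`; the factories use
    // multiple = TILE_WIDTH (32) when sizing circular-buffer pages in bytes.
    static uint32_t round_up_to(uint32_t value, uint32_t multiple) {
        return ((value + multiple - 1) / multiple) * multiple;
    }
    // e.g. round_up_to(1920, 32) == 1920, round_up_to(1921, 32) == 1952.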
override_runtime_args_callback = - [reader_kernel_id=reader_kernel_id, writer_kernel_id=writer_kernel_id]( - const Program &program, - const std::vector& input_buffers, - const std::vector& output_buffers) { - auto src_buffer = input_buffers.at(0); - auto dst_buffer = output_buffers.at(0); - CoreCoord core = {0, 0}; - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, reader_kernel_id, core); - runtime_args[0] = src_buffer->address(); - runtime_args[1] = dst_buffer->address(); - } - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, writer_kernel_id, core); - runtime_args[0] = src_buffer->address(); - runtime_args[1] = dst_buffer->address(); - } - }; - - return {std::move(program), override_runtime_args_callback}; -} operation::ProgramWithCallbacks pad_rm_opt(const Tensor &a, Tensor &output, const Shape &output_tensor_shape, const Shape &input_tensor_start, - const float pad_value) { - Program program{}; - - auto output_shape = output_tensor_shape; - - uint32_t unpadded_row_size_nbytes = a.get_legacy_shape()[3] * a.element_size(); - uint32_t padded_row_size_nbytes = output_shape[3] * a.element_size(); // Assuming output is same datatype as input - TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size"); - - Device *device = a.device(); - auto dst_buffer_l1 = Buffer(device, padded_row_size_nbytes, padded_row_size_nbytes, BufferType::L1); - - // construct const buffer with the pad_value - uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32 - uint32_t pad_value_const_buffer_nbytes = pad_value_const_buffer_size * a.element_size(); - auto pad_value_const_buffer = owned_buffer::create(std::vector(pad_value_const_buffer_size, bfloat16(pad_value))); - const Tensor pad_value_const_tensor = - Tensor( - OwnedStorage{pad_value_const_buffer}, - Shape(std::array{1, 1, 1, pad_value_const_buffer_size}), - DataType::BFLOAT16, - Layout::ROW_MAJOR) - .to(device, MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1}); - auto pad_value_const_tensor_addr = pad_value_const_tensor.buffer()->address(); - - Buffer *src0_buffer = a.buffer(); - Buffer *dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(unpadded_row_size_nbytes); - uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(unpadded_row_size_nbytes) : 0; - bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(padded_row_size_nbytes); - uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? 
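Every factory in this patch returns the program together with an override-runtime-args callback. The callback exists because programs are cached: when the same op configuration runs again, only the tensor buffer addresses change, so the cached program is reused and the callback patches the address arguments in place. Below is a stand-in sketch of the pattern using plain containers; the real callbacks capture kernel handles and go through GetRuntimeArgs/SetRuntimeArgs, as shown above.

    #include <cstdint>
    #include <functional>
    #include <vector>

    // Stand-in for a cached program: just the two runtime-arg vectors to patch.
    struct CachedArgs {
        std::vector<uint32_t> reader_args;
        std::vector<uint32_t> writer_args;
    };

    // Build a callback that rewrites only the buffer addresses (args 0 and 1 by the
    // convention these pad kernels use) on every program-cache hit.
    static std::function<void(CachedArgs&, uint32_t, uint32_t)> make_override_callback() {
        return [](CachedArgs& cached, uint32_t src_addr, uint32_t dst_addr) {
            cached.reader_args[0] = src_addr;
            cached.reader_args[1] = dst_addr;
            cached.writer_args[0] = src_addr;
            cached.writer_args[1] = dst_addr;
        };
    }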
(std::uint32_t) std::log2(padded_row_size_nbytes) : 0; - std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, - (std::uint32_t) dst_is_dram, - (std::uint32_t) src_stick_size_is_power_of_two, - (std::uint32_t) src_log2_stick_size, - (std::uint32_t) dst_stick_size_is_power_of_two, - (std::uint32_t) dst_log2_stick_size}; - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - bfloat16 bfloat_zero = bfloat16(0.0f); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_zero, bfloat_pad_value}); - - CoreRange core({0, 0}, {0, 0}); - KernelHandle reader_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/pad_dims_rm_interleaved_opt.cpp", - core, - tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); - uint32_t padded_row_diff_size_nbytes = padded_row_size_nbytes - unpadded_row_size_nbytes; - - #if 0 - { - tt::log_debug("src0_buffer_addr: {}", src0_buffer->address()); - tt::log_debug("dst_buffer_addr: {}", dst_buffer->address()); - tt::log_debug("a.shape[0]: {}", a.get_legacy_shape()[0]); - tt::log_debug("out.shape[0]: {}", output_shape[0]); - tt::log_debug("a.shape[1]: {}", a.get_legacy_shape()[1]); - tt::log_debug("out.shape[1]: {}", output_shape[1]); - tt::log_debug("a.shape[2]: {}", a.get_legacy_shape()[2]); - tt::log_debug("out.shape[2]: {}", output_shape[2]); - tt::log_debug("s.shape[3]: {}", a.get_legacy_shape()[3]); - tt::log_debug("out.shape[3]: {}", output_shape[3]); - tt::log_debug("unpadded_row_size_nbytes: {}", unpadded_row_size_nbytes); - tt::log_debug("padded_row_size_nbytes: {}", padded_row_size_nbytes); - tt::log_debug("padded_row_diff_size_nbytes: {}", padded_row_diff_size_nbytes); - tt::log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); - tt::log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); - tt::log_debug("packed_pad_value: {}", packed_pad_value); - tt::log_debug("dst_buffer_l1_addr: {}", dst_buffer_l1.address()); - } - #endif - - vector reader_rt_args = {src0_buffer->address(), - dst_buffer->address(), - a.get_legacy_shape()[0], - output_shape[0], - a.get_legacy_shape()[1], - output_shape[1], - a.get_legacy_shape()[2], - output_shape[2], - a.get_legacy_shape()[3], - output_shape[3], - unpadded_row_size_nbytes, - padded_row_size_nbytes, - padded_row_diff_size_nbytes, - pad_value_const_tensor_addr, - pad_value_const_buffer_nbytes, - packed_pad_value, - dst_buffer_l1.address()}; - tt::tt_metal::SetRuntimeArgs(program, - reader_kernel_id, - core, - reader_rt_args); - - auto override_runtime_args_callback = [kernel_id=reader_kernel_id](const Program &program, - const std::vector& input_buffers, - const std::vector& output_buffers) { - auto src_buffer = input_buffers.at(0); - auto dst_buffer = output_buffers.at(0); - CoreCoord core = {0, 0}; - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, kernel_id, core); - runtime_args[0] = src_buffer->address(); - runtime_args[1] = dst_buffer->address(); - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - -operation::ProgramWithCallbacks pad_rm(const Tensor &a, Tensor &output, const Shape &output_tensor_shape, const Shape &input_tensor_start, const float pad_value) { - - tt::tt_metal::Program program{}; - - CoreRange core({0, 0}, {0, 0}); - - // This should allocate a DRAM buffer on the device - tt::tt_metal::Device *device = a.device(); - - auto output_shape = output_tensor_shape; - - tt::tt_metal::Buffer *src0_buffer = a.buffer(); - - uint32_t unpadded_row_size_bytes = 
a.get_legacy_shape()[3] * a.element_size(); - uint32_t padded_row_size_bytes = output_shape[3] * a.element_size(); - - tt::tt_metal::Buffer *dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - uint32_t src_stick_size = unpadded_row_size_bytes; - uint32_t dst_stick_size = padded_row_size_bytes; - - uint32_t dst_buffer_size = dst_stick_size; - - tt::tt_metal::InterleavedBufferConfig buff_config{ - .device= device, - .size = dst_buffer_size, - .page_size = dst_buffer_size, - .buffer_type = tt::tt_metal::BufferType::L1 - }; - - auto dst_buffer_l1 = tt::tt_metal::CreateBuffer(buff_config); - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); - - vector reader_kernel_args = { - src0_buffer->address(), - dst_buffer->address(), - a.get_legacy_shape()[0], - output_shape[0], - a.get_legacy_shape()[1], - output_shape[1], - a.get_legacy_shape()[2], - output_shape[2], - a.get_legacy_shape()[3], - output_shape[3], - unpadded_row_size_bytes, - padded_row_size_bytes, - padded_row_size_bytes - unpadded_row_size_bytes, - packed_pad_value, - dst_buffer_l1->address() - }; - bool src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - bool src_stick_size_is_power_of_two = tt::tt_metal::is_power_of_two_at_least_32(src_stick_size); - uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(src_stick_size) : 0; - bool dst_stick_size_is_power_of_two = tt::tt_metal::is_power_of_two_at_least_32(dst_stick_size); - uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? 
(std::uint32_t) std::log2(dst_stick_size) : 0; - std::vector compile_time_args_vec = { - (std::uint32_t) src0_is_dram, - (std::uint32_t) dst_is_dram, - (std::uint32_t) src_stick_size_is_power_of_two, - (std::uint32_t) src_log2_stick_size, - (std::uint32_t) dst_stick_size_is_power_of_two, - (std::uint32_t) dst_log2_stick_size, - - }; - - // Tilized reader - tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/pad_dims_rm_interleaved.cpp", - core, - tt::tt_metal::ReaderDataMovementConfig(compile_time_args_vec)); - - tt::tt_metal::SetRuntimeArgs( - program, - unary_reader_kernel_id, - core, - reader_kernel_args - ); - - auto override_runtime_args_callback = [kernel_id=unary_reader_kernel_id]( - const Program &program, - const std::vector& input_buffers, - const std::vector& output_buffers - ) { - - auto src_buffer = input_buffers.at(0); - auto dst_buffer = output_buffers.at(0); - - CoreCoord core = {0, 0}; - - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, kernel_id, core); - runtime_args[0] = src_buffer->address(); - runtime_args[1] = dst_buffer->address(); - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - -operation::ProgramWithCallbacks pad_tile(const Tensor &a, Tensor& output, const tt::tt_metal::Shape &output_tensor_shape, const tt::tt_metal::Shape &input_tensor_start, const float pad_value) { - - tt::tt_metal::Program program{}; - - CoreRange core({0, 0}, {0, 0}); - - // This should allocate a DRAM buffer on the device - tt::tt_metal::Device *device = a.device(); - - auto output_shape = output_tensor_shape; - - tt::tt_metal::Buffer *src0_buffer = a.buffer(); - - tt::tt_metal::Buffer *dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t single_tile_size = tt::tt_metal::detail::TileSize(cb_data_format); - - tt::log_debug("pad_tile"); - tt::log_debug("cb_data_format: {}", cb_data_format); - tt::log_debug("single_tile_size: {}", single_tile_size); - tt::log_debug("output_tensor_shape: {}", output_tensor_shape); - tt::log_debug("input_tensor_start: {}", input_tensor_start); - tt::log_debug("pad_value: {}", pad_value); - - uint32_t src0_cb_index = 0; - uint32_t num_input_tiles = 2; - tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - - uint32_t src1_cb_index = 1; // For pad buffer - uint32_t num_pad_tiles = 1; - tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(num_pad_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, single_tile_size); - auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); - - uint32_t num_unpadded_Xt = a.get_legacy_shape()[3] / TILE_WIDTH; - uint32_t num_total_Xt = output_shape[3] / TILE_WIDTH; - uint32_t num_padded_Xt = num_total_Xt - num_unpadded_Xt; - uint32_t num_unpadded_Yt = a.get_legacy_shape()[2] / TILE_HEIGHT; - uint32_t num_total_Yt = output_shape[2] 
/ TILE_HEIGHT; - uint32_t num_padded_Yt = (num_total_Yt - num_unpadded_Yt) * num_total_Xt; - uint32_t num_unpadded_Z = a.get_legacy_shape()[1]; - uint32_t num_total_Z = output_shape[1]; - uint32_t num_padded_Zt = (num_total_Z - num_unpadded_Z) * num_total_Yt * num_total_Xt; - uint32_t num_unpadded_W = a.get_legacy_shape()[0]; - uint32_t num_total_W = output_shape[0]; - uint32_t num_padded_Wt = (num_total_W - num_unpadded_W) * num_total_Z * num_total_Yt * num_total_Xt; - - uint32_t num_unpadded_tiles = a.volume() / TILE_HW; - - vector reader_kernel_args = { - src0_buffer->address(), - num_unpadded_tiles, 0 - }; - vector writer_kernel_args = { - dst_buffer->address(), - num_unpadded_W, - num_padded_Wt, - num_unpadded_Z, - num_padded_Zt, - num_unpadded_Yt, - num_padded_Yt, - num_unpadded_Xt, - num_padded_Xt, - packed_pad_value, - }; - - // Reader compile-time args - // Data is 32 byte aligned - bool src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - std::vector reader_compile_time_args = { - // interleaved accessor args - (std::uint32_t) src0_is_dram - }; - std::vector writer_compile_time_args = { - // interleaved accessor args - (std::uint32_t) src0_cb_index, - (std::uint32_t) src1_cb_index, - (std::uint32_t) dst_is_dram - }; - // Tilized reader - tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp", - core, - tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - - tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_unary_pad_dims_interleaved.cpp", - core, - tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - - tt::tt_metal::SetRuntimeArgs( - program, - unary_reader_kernel_id, - core, - reader_kernel_args - ); - - tt::tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel_id, - core, - writer_kernel_args - ); - - auto override_runtime_args_callback = [unary_reader_kernel_id, unary_writer_kernel_id]( - const Program &program, - const std::vector& input_buffers, - const std::vector& output_buffers - ) { - - auto src_dram_buffer = input_buffers.at(0); - - auto dst_dram_buffer = output_buffers.at(0); - - CoreCoord core = {0, 0}; - - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, unary_reader_kernel_id, core); - runtime_args[0] = src_dram_buffer->address(); - } - - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, unary_writer_kernel_id, core); - runtime_args[0] = dst_dram_buffer->address(); - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - - -inline void log_rt_args(const CoreCoord& core, vector& args) { - for (auto v : args) { - tt::log_debug(tt::LogOp, "{},{} :: {}", core.x, core.y, v); - } -} - -// This is currently mostly hardcoded for resnet shapes -inline std::tuple - split_across_cores(CoreCoord grid_size, uint32_t nbatch, uint32_t nchannel, uint32_t ntiles_h, uint32_t ntiles_w) { - - uint32_t ncores, ncores_h, ncores_w, ntiles_per_core_h, ntiles_per_core_w, nbatch_per_core_h, ncores_per_batch_h; + const float pad_value); - ncores_h = 1; +operation::ProgramWithCallbacks pad_rm(const Tensor &a, Tensor &output, const Shape &output_tensor_shape, const Shape &input_tensor_start, const float pad_value); - // each batch 
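The tile-count arithmetic above is easier to check against a concrete case. The example below pads a hypothetical [1, 2, 64, 96] tensor to [1, 3, 96, 128] with 32x32 tiles and evaluates the same expressions; the shapes are made up purely for illustration.

    #include <cassert>
    #include <cstdint>

    int main() {
        constexpr uint32_t TILE_W = 32, TILE_H = 32;
        const uint32_t in[4]  = {1, 2, 64, 96};    // N, C, H, W (unpadded)
        const uint32_t out[4] = {1, 3, 96, 128};   // N, C, H, W (padded)

        const uint32_t num_unpadded_Xt = in[3] / TILE_W;                                           // 3
        const uint32_t num_total_Xt    = out[3] / TILE_W;                                          // 4
        const uint32_t num_padded_Xt   = num_total_Xt - num_unpadded_Xt;                           // 1
        const uint32_t num_unpadded_Yt = in[2] / TILE_H;                                           // 2
        const uint32_t num_total_Yt    = out[2] / TILE_H;                                          // 3
        const uint32_t num_padded_Yt   = (num_total_Yt - num_unpadded_Yt) * num_total_Xt;          // 4
        const uint32_t num_padded_Zt   = (out[1] - in[1]) * num_total_Yt * num_total_Xt;           // 12
        const uint32_t num_padded_Wt   = (out[0] - in[0]) * out[1] * num_total_Yt * num_total_Xt;  // 0

        assert(num_padded_Xt == 1 && num_padded_Yt == 4 && num_padded_Zt == 12 && num_padded_Wt == 0);
        return 0;
    }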
needs to be padded independently - switch (nbatch) { - case 1: - ncores_h = 1; - nbatch_per_core_h = 1; - ntiles_per_core_h = 1; - switch (ntiles_h) { - case 2: ncores_h = 2; ntiles_per_core_h = 1; break; - case 4: ncores_h = 4; ntiles_per_core_h = 1; break; - case 8: ncores_h = 8; ntiles_per_core_h = 1; break; - case 64: ncores_h = 8; ntiles_per_core_h = 8; break; - } - ncores_per_batch_h = ncores_h; - break; - - case 2: - ncores_h = 1; - ncores_per_batch_h = 1; - nbatch_per_core_h = 1; - ntiles_per_core_h = 1; - switch (ntiles_h) { - case 2: ncores_per_batch_h = 2; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 1; break; - case 4: ncores_per_batch_h = 4; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 1; break; - case 8: ncores_per_batch_h = 4; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 2; break; - case 64: ncores_per_batch_h = 4; ncores_h = ncores_per_batch_h * nbatch; ntiles_per_core_h = 16; break; - } - break; - - case 8: - ncores_h = 8; - ncores_per_batch_h = 1; - nbatch_per_core_h = 1; - ntiles_per_core_h = ntiles_h; - break; - - default: - TT_ASSERT(false, "unhandled nbatch. TODO"); - - // generic case -- TODO - - // one of the following will be 0 when grid_size.y != nbatch - uint32_t nbatch_per_core_h = nbatch / grid_size.y; // floor - uint32_t ncores_per_batch_h = grid_size.y / nbatch; // floor - if (nbatch == grid_size.y) { - nbatch_per_core_h = 1; - ncores_per_batch_h = 1; - } - - // currently uses hardcoded values for resnet50 - // TT_ASSERT(ntiles_h == 1 || ntiles_h == 2 || ntiles_h == 4 || ntiles_h == 16, "Only Resnet50 shapes are supported in multicore version for now."); - // TT_ASSERT(ntiles_w == 64, "Only Resnet50 shapes are supported in multicore version for now."); - - TT_ASSERT(nbatch <= grid_size.y, "Unsupported case with nbatch > grid_size.y!"); - - uint32_t ncores_h = 1; - uint32_t ntiles_per_core_h = ntiles_h / ncores_h; - if (nbatch_per_core_h == 0) { - // there are multiple cores along h per batch - nbatch_per_core_h = 1; - ncores_h = ncores_per_batch_h * nbatch; - ntiles_per_core_h = ntiles_h / ncores_per_batch_h; - } else if (ncores_per_batch_h == 0) { - // unsupported case. TODO. 
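In the generic (default) branch shown above, the pairing of batches with grid rows comes down to two floor divisions, exactly one of which is zero whenever nbatch differs from grid_size.y. A small sketch of the two regimes; the multiple-batches-per-row case is the one the factory still rejects.

    #include <cstdint>
    #include <utility>

    // Returns {nbatch_per_core_h, ncores_per_batch_h}. When nbatch != grid_rows
    // exactly one of the two is zero, which is what the generic branch keys off.
    static std::pair<uint32_t, uint32_t> batches_vs_grid_rows(uint32_t nbatch, uint32_t grid_rows) {
        uint32_t nbatch_per_core_h  = nbatch / grid_rows;   // > 0 when batches outnumber rows
        uint32_t ncores_per_batch_h = grid_rows / nbatch;   // > 0 when rows outnumber batches
        if (nbatch == grid_rows) {
            nbatch_per_core_h = 1;
            ncores_per_batch_h = 1;
        }
        return {nbatch_per_core_h, ncores_per_batch_h};
    }
    // e.g. batches_vs_grid_rows(2, 8) -> {0, 4}: four rows of cores share each batch;
    //      batches_vs_grid_rows(16, 8) -> {2, 0}: several batches per row, still unsupported.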
- TT_ASSERT(false); - // there are multiple batch per core along h - // ncores_per_batch_h = 1; - // ncores_h = (uint32_t) ceil((float) nbatch / nbatch_per_core_h); - // ntiles_per_core_h = nbatch_per_core_h * ntiles_h; - } else { - TT_ASSERT("Something went terribly wrong in splitting acrtoss cores"); - } - break; - } - - ncores_w = 1; - switch (ntiles_w) { - case 2: ncores_w = 2; break; - case 4: ncores_w = 4; break; - case 8: ncores_w = 8; break; - case 64: ncores_w = 8; break; - } - ncores = ncores_h * ncores_w; - ntiles_per_core_w = ntiles_w / ncores_w; - std::set all_cores; - std::set core_range; - - all_cores.insert(CoreRange(CoreCoord(0, 0), CoreCoord(ncores_w - 1, ncores_h - 1))); - core_range.insert(CoreRange(CoreCoord(0, 0), CoreCoord(ncores_w - 1, ncores_h - 1))); - - return std::make_tuple(ncores, ncores_h, ncores_w, all_cores, core_range, ntiles_per_core_h, ntiles_per_core_w, nbatch_per_core_h, ncores_per_batch_h); -} +operation::ProgramWithCallbacks pad_tile(const Tensor &a, Tensor& output, const tt::tt_metal::Shape &output_tensor_shape, const tt::tt_metal::Shape &input_tensor_start, const float pad_value); operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core(const Tensor &a, Tensor &output, const tt::tt_metal::Shape &output_tensor_shape, const tt::tt_metal::Shape &input_tensor_start, - const float pad_value) { - Program program{}; - - auto output_shape = output_tensor_shape; - - uint32_t unpadded_row_size_nbytes = a.get_legacy_shape()[3] * a.element_size(); - uint32_t padded_row_size_nbytes = output_shape[3] * a.element_size(); // Assuming output is same datatype as input - TT_ASSERT(unpadded_row_size_nbytes <= padded_row_size_nbytes, "Padded output tensor size should be >= input tensor size"); - - Device *device = a.device(); - - // construct const buffer with the pad_value - uint32_t pad_value_const_buffer_size = 32; // noc transfers in chunks of 32 - uint32_t pad_value_const_buffer_nbytes = pad_value_const_buffer_size * a.element_size(); - auto pad_value_const_buffer = owned_buffer::create(std::vector(pad_value_const_buffer_size, bfloat16(pad_value))); - // NOTE: The const buffer is always in L1 - // TODO: make a local buffer for each core? - const Tensor pad_value_const_tensor = - Tensor( - OwnedStorage{pad_value_const_buffer}, - Shape(std::array{1, 1, 1, pad_value_const_buffer_size}), - DataType::BFLOAT16, - Layout::ROW_MAJOR) - .to(device, MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1}); - auto pad_value_const_tensor_addr = pad_value_const_tensor.buffer()->address(); - - // uint32_t ntiles_h = output_tensor_shape[0] * output_tensor_shape[1] * output_tensor_shape[2] / TILE_HEIGHT; - uint32_t ntiles_h = output_tensor_shape[2] / TILE_HEIGHT; - uint32_t ntiles_w = output_tensor_shape[3] / TILE_WIDTH; - - auto grid_size = device->compute_with_storage_grid_size(); - uint32_t nbatch = output_tensor_shape[0]; - uint32_t nchannel = output_tensor_shape[1]; - // first the batch dim is distributed along H, and within each batch then the tiles are distributed. 
- auto [ncores, ncores_h, ncores_w, all_cores, core_range, ntiles_per_core_h, ntiles_per_core_w, nbatch_per_core_h, ncores_per_batch_h] = split_across_cores(grid_size, nbatch, nchannel, ntiles_h, ntiles_w); - - int32_t src_nbytes_per_core_w = ntiles_per_core_w * TILE_WIDTH * a.element_size(); - int32_t dst_nbytes_per_core_w = ntiles_per_core_w * TILE_WIDTH * output.element_size(); - - uint32_t cb_id = tt::CB::c_in0; - uint32_t cb_npages = 16; // multibuffering for perf - // uint32_t cb_npages = 1; // multibuffering for perf - uint32_t cb_pagesize = (uint32_t) ceil((float) dst_nbytes_per_core_w / tt::constants::TILE_WIDTH) * tt::constants::TILE_WIDTH; - tt::DataFormat in_df = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(cb_npages * cb_pagesize, {{cb_id, in_df}}) - .set_page_size(cb_id, cb_pagesize); - auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_config); - - Buffer *src0_buffer = a.buffer(); - Buffer *dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(unpadded_row_size_nbytes); - uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(unpadded_row_size_nbytes) : 0; - bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(padded_row_size_nbytes); - uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? (std::uint32_t) std::log2(padded_row_size_nbytes) : 0; - std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, - (std::uint32_t) dst_is_dram, - (std::uint32_t) src_stick_size_is_power_of_two, - (std::uint32_t) src_log2_stick_size, - (std::uint32_t) dst_stick_size_is_power_of_two, - (std::uint32_t) dst_log2_stick_size}; - std::vector writer_ct_args = reader_ct_args; - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - bfloat16 bfloat_zero = bfloat16(0.0f); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_zero, bfloat_pad_value}); - - KernelHandle reader_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp", - all_cores, - tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); - KernelHandle writer_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp", - all_cores, - tt::tt_metal::WriterDataMovementConfig(writer_ct_args)); - // int32_t padded_row_diff_size_nbytes = padded_row_size_nbytes - unpadded_row_size_nbytes; - log_rt_args(CoreCoord{0, 0}, reader_ct_args); - - #if 1 - { - tt::log_debug("ncores: {}", ncores); - tt::log_debug("ncores_h: {}", ncores_h); - tt::log_debug("ncores_w: {}", ncores_w); - tt::log_debug("ntiles_per_core_h: {}", ntiles_per_core_h); - tt::log_debug("ntiles_per_core_w: {}", ntiles_per_core_w); - tt::log_debug("src0_buffer_addr: {}", src0_buffer->address()); - tt::log_debug("dst_buffer_addr: {}", dst_buffer->address()); - tt::log_debug("a.shape[0]: {}", a.get_legacy_shape()[0]); - tt::log_debug("out.shape[0]: {}", output_shape[0]); - tt::log_debug("a.shape[1]: {}", a.get_legacy_shape()[1]); - tt::log_debug("out.shape[1]: {}", output_shape[1]); - tt::log_debug("a.shape[2]: {}", a.get_legacy_shape()[2]); - 
tt::log_debug("out.shape[2]: {}", output_shape[2]); - tt::log_debug("s.shape[3]: {}", a.get_legacy_shape()[3]); - tt::log_debug("out.shape[3]: {}", output_shape[3]); - tt::log_debug("unpadded_row_size_nbytes: {}", unpadded_row_size_nbytes); - tt::log_debug("padded_row_size_nbytes: {}", padded_row_size_nbytes); - // tt::log_debug("padded_row_diff_size_nbytes: {}", padded_row_diff_size_nbytes); - tt::log_debug("pad_value_const_tensor_addr: {}", pad_value_const_tensor_addr); - tt::log_debug("pad_value_const_buffer_nbytes: {}", pad_value_const_buffer_nbytes); - tt::log_debug("packed_pad_value: {}", packed_pad_value); - tt::log_debug("src_nbytes_per_core_w: {}", src_nbytes_per_core_w); - tt::log_debug("dst_nbytes_per_core_w: {}", dst_nbytes_per_core_w); - tt::log_debug("nbatch_per_core_h: {}", nbatch_per_core_h); - tt::log_debug("ncores_per_batch_h: {}", ncores_per_batch_h); - } - #endif - - uint32_t start_src_stick_id = 0; - uint32_t start_dst_stick_id = 0; - uint32_t start_src_stick_wi = 0; // start of stick segment for 2d decomp - uint32_t start_dst_stick_wi = 0; - int32_t local_nsticks = ntiles_per_core_h * TILE_HEIGHT; - int32_t rem_nbatch = nbatch; // per core h, there are ncores_per_batch_h cores, ie each batch ncores_h = ncores_per_batch_h - for (int32_t b = 0; b < nbatch; ++ b) { - int32_t rem_src_nsticks = a.get_legacy_shape()[2]; - for (uint32_t j = 0; j < ncores_per_batch_h; ++ j) { - uint32_t num_local_unpadded_nsticks = local_nsticks; - if (rem_src_nsticks - local_nsticks >= 0) { - // not reached padding sticks yet - rem_src_nsticks -= local_nsticks; - } else { - num_local_unpadded_nsticks = rem_src_nsticks; - rem_src_nsticks = 0; - } - start_src_stick_wi = 0; - start_dst_stick_wi = 0; - int32_t rem_src_stick_size_nbytes = unpadded_row_size_nbytes; - for (uint32_t i = 0; i < ncores_w; ++ i) { - CoreCoord core = {i, b * ncores_per_batch_h + j}; - uint32_t curr_stick_size_nbytes = 0; - int32_t curr_stick_diff_nbytes = 0; - if (rem_src_stick_size_nbytes - dst_nbytes_per_core_w >= 0) { - // no padding on this core - curr_stick_size_nbytes = dst_nbytes_per_core_w; - rem_src_stick_size_nbytes -= dst_nbytes_per_core_w; - } else { - // this core has padding - curr_stick_size_nbytes = rem_src_stick_size_nbytes; - curr_stick_diff_nbytes = dst_nbytes_per_core_w - curr_stick_size_nbytes; - rem_src_stick_size_nbytes = 0; - } - vector reader_rt_args = {src0_buffer->address(), - dst_buffer->address(), - a.get_legacy_shape()[0], - output_shape[0], - a.get_legacy_shape()[1], - output_shape[1], - a.get_legacy_shape()[2], - output_shape[2], - a.get_legacy_shape()[3], - output_shape[3], - curr_stick_size_nbytes, - (uint32_t) dst_nbytes_per_core_w, - (uint32_t) curr_stick_diff_nbytes, - pad_value_const_tensor_addr, - pad_value_const_buffer_nbytes, - packed_pad_value, - start_src_stick_id, - start_dst_stick_id, - start_src_stick_wi, - start_dst_stick_wi, - start_src_stick_wi * a.element_size(), - (uint32_t) local_nsticks, - num_local_unpadded_nsticks, - unpadded_row_size_nbytes, - padded_row_size_nbytes, - start_dst_stick_wi * output.element_size(), - nbatch_per_core_h - }; - // if (core.x == 0) log_rt_args(core, reader_rt_args); - // if (core.x == 0) { - // log_debug("{} :: start_src_stick_id: {}", core.y, start_src_stick_id); - // log_debug("{} :: start_dst_stick_id: {}", core.y, start_dst_stick_id); - // log_debug("{} :: local_nsticks: {}", core.y, local_nsticks); - // log_debug("{} :: num_local_unpadded_nsticks: {}", core.y, num_local_unpadded_nsticks); - // log_debug("{} :: nbatch_per_core_h: {}", 
core.y, nbatch_per_core_h); - // log_debug("{} :: ncores_per_batch_h: {}", core.y, ncores_per_batch_h); - // } - vector writer_rt_args = reader_rt_args; - tt::tt_metal::SetRuntimeArgs(program, - reader_kernel_id, - core, - reader_rt_args); - tt::tt_metal::SetRuntimeArgs(program, - writer_kernel_id, - core, - writer_rt_args); - start_src_stick_wi += ntiles_per_core_w * TILE_WIDTH; - start_dst_stick_wi += ntiles_per_core_w * TILE_WIDTH; - } // for ncores_w - start_src_stick_id += num_local_unpadded_nsticks; - start_dst_stick_id += local_nsticks; - } // for ncores_h - } - - auto override_runtime_args_callback = [reader_kernel_id = reader_kernel_id, - writer_kernel_id = writer_kernel_id, - ncores_h = ncores_h, - ncores_w = ncores_w]( - const Program &program, - const std::vector &input_buffers, - const std::vector &output_buffers) { - auto src_buffer = input_buffers.at(0); - auto dst_buffer = output_buffers.at(0); - - for (uint32_t j = 0; j < ncores_h; ++ j) { - for (uint32_t i = 0; i < ncores_w; ++ i) { - CoreCoord core = {i, j}; - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, reader_kernel_id, core); - runtime_args[0] = src_buffer->address(); - runtime_args[1] = dst_buffer->address(); - } - { - auto &runtime_args = tt::tt_metal::GetRuntimeArgs(program, writer_kernel_id, core); - runtime_args[0] = src_buffer->address(); - runtime_args[1] = dst_buffer->address(); - } - } - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - - -std::vector, std::vector > > get_runtime_args_rm(const Tensor &input_tensor, - Tensor &output_tensor, - uint32_t num_cores_total, - uint32_t num_cores, - uint32_t num_cores_y, - CoreRangeSet core_group_1, - uint32_t num_w_sticks_per_core_group_1, - CoreRangeSet core_group_2, - uint32_t num_w_sticks_per_core_group_2 - ){ - - auto input_buffer = input_tensor.buffer(); - auto output_buffer = output_tensor.buffer(); - auto input_shape = input_tensor.get_legacy_shape(); - auto output_shape = output_tensor.get_legacy_shape(); - - uint32_t W = input_shape[3], H = input_shape[2], C = input_shape[1], N = input_shape[0]; - uint32_t W_bytes = W * input_tensor.element_size(); - - uint32_t W_padded = output_shape[3], H_padded = output_shape[2], C_padded = output_shape[1], N_padded = output_shape[0]; - uint32_t W_padded_bytes = W_padded * input_tensor.element_size(); - - std::uint32_t num_dims = static_cast(input_shape.rank()); - std::vector start_dim_offset(num_dims, 0); - - - std::vector, std::vector > > ret_val(num_cores_total); - - - uint32_t max_read_size = 2048; - uint32_t curr_c = 0, curr_h = 0, curr_n = 0; - for(uint32_t i = 0, curr_sticks_read = 0, curr_sticks_write = 0; i < num_cores_total; i++) { - CoreCoord core = {i / num_cores_y, i % num_cores_y}; + const float pad_value); - uint32_t num_sticks_per_core; - if (core_group_1.core_coord_in_core_ranges(core)) { - num_sticks_per_core = num_w_sticks_per_core_group_1; - } else if (core_group_2.core_coord_in_core_ranges(core)) { - num_sticks_per_core = num_w_sticks_per_core_group_2; - } else { - //no-op - num_sticks_per_core = 0; - } - - - // issue more reads before calling barrier - uint32_t num_sticks_per_core_read = 0, num_read_per_barrier = 0; - if (num_sticks_per_core != 0) { - num_sticks_per_core_read = merge_num_sticks_to_read(num_sticks_per_core, W_bytes, max_read_size); - num_read_per_barrier = num_sticks_per_core / num_sticks_per_core_read; - } - - // reader - std::vector reader_runtime_args = { - input_buffer->address(), - num_sticks_per_core_read, - num_read_per_barrier, - 
curr_sticks_read, - }; - reader_runtime_args.insert(reader_runtime_args.end(), start_dim_offset.begin(), start_dim_offset.end()); - - // writer - std::vector writer_runtime_args = { - output_buffer->address(), - num_sticks_per_core_read, - num_read_per_barrier, - curr_sticks_write - }; - - ret_val[i] = {reader_runtime_args, writer_runtime_args}; - - curr_sticks_write += num_sticks_per_core; - - for (uint32_t i = 0; i < num_sticks_per_core; ++i) { - - if (curr_h < H and curr_c < C and curr_n < N) { - curr_sticks_read++; - } - - curr_h++; - if (curr_h == H_padded) { - curr_c++; - curr_h = 0; - if (curr_c == C_padded) { - curr_n++; - curr_c = 0; - } - } - } - - start_dim_offset = {0, curr_h, curr_c, curr_n}; - - } - - return ret_val; -} operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2(const Tensor &a, Tensor &output, const tt::tt_metal::Shape &output_tensor_shape, const tt::tt_metal::Shape &input_tensor_start, - const float pad_value) { - Program program{}; - - auto output_shape = output_tensor_shape; - uint32_t W = a.shape()[3], H = a.shape()[2], C = a.shape()[1], N = a.shape()[0]; - uint32_t NCH = H * C * N; - uint32_t W_padded = output_tensor_shape[3], H_padded = output_tensor_shape[2], C_padded = output_tensor_shape[1], N_padded = output_tensor_shape[0]; - uint32_t NCH_padded = H_padded * C_padded * N_padded; - - auto stick_size = W * a.element_size(); - auto stick_size_padded = W_padded * a.element_size(); - auto rem_stick_size_padded = stick_size_padded - stick_size; - uint32_t row_major_min_bytes = 16; - - tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - - Device *device = a.device(); - - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - uint32_t num_cores_total = num_cores_x * num_cores_y; - CoreRange total_cores({0, 0}, {num_cores_x-1, num_cores_y-1}); - - auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_padded_per_core_group_1, num_sticks_padded_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, NCH_padded); - - uint32_t src0_cb_index = 0; - auto num_sticks = num_sticks_padded_per_core_group_1 > num_sticks_padded_per_core_group_2 ? num_sticks_padded_per_core_group_1 : num_sticks_padded_per_core_group_2; - - tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, stick_size_padded); - auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - - // construct const buffer with the pad_value - bool not_pad_by_zero = pad_value != 0; - if (not_pad_by_zero) { - uint32_t src1_cb_index = 1; - tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(row_major_min_bytes, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, row_major_min_bytes); - auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config); - } - - Buffer *src0_buffer = a.buffer(); - Buffer *dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); - - bool src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 
1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - bool src_stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); - uint32_t src_log2_stick_size = src_stick_size_is_power_of_two ? (std::uint32_t) std::log2(stick_size) : 0; - bool dst_stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size_padded); - uint32_t dst_log2_stick_size = dst_stick_size_is_power_of_two ? (std::uint32_t) std::log2(stick_size_padded) : 0; - std::vector reader_ct_args = {(std::uint32_t) src0_is_dram, - (std::uint32_t) N, - (std::uint32_t) H, - (std::uint32_t) C, - (std::uint32_t) stick_size, - (std::uint32_t) N_padded, - (std::uint32_t) H_padded, - (std::uint32_t) C_padded, - (std::uint32_t) stick_size_padded, - (std::uint32_t) (stick_size_padded - stick_size), - (std::uint32_t) not_pad_by_zero, - (std::uint32_t) packed_pad_value, - (std::uint32_t) row_major_min_bytes, - (std::uint32_t) (rem_stick_size_padded / row_major_min_bytes), - (std::uint32_t) (stick_size_padded / row_major_min_bytes), - (std::uint32_t) src_stick_size_is_power_of_two, - (std::uint32_t) src_stick_size_is_power_of_two ? src_log2_stick_size : stick_size}; - std::vector writer_ct_args = {(std::uint32_t) src0_cb_index, - (std::uint32_t) dst_is_dram, - (std::uint32_t) stick_size_padded, - (std::uint32_t) dst_stick_size_is_power_of_two, - (std::uint32_t) dst_stick_size_is_power_of_two ? dst_log2_stick_size : stick_size_padded}; - - KernelHandle reader_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp", - total_cores, - tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); - KernelHandle writer_kernel_id = CreateKernel(program, - "ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp", - total_cores, - tt::tt_metal::WriterDataMovementConfig(writer_ct_args)); - - auto all_runtime_args = get_runtime_args_rm(a, output, num_cores_total, num_cores, num_cores_y, core_group_1, num_sticks_padded_per_core_group_1, core_group_2, num_sticks_padded_per_core_group_2); - - for(uint32_t i = 0; i < num_cores_total; i++) { - CoreCoord core = {i / num_cores_y, i % num_cores_y}; - tt::tt_metal::SetRuntimeArgs( - program, - reader_kernel_id, - core, - all_runtime_args[i].first - ); - - tt::tt_metal::SetRuntimeArgs( - program, - writer_kernel_id, - core, - all_runtime_args[i].second - - ); - } - - auto override_runtime_args_callback = [ - reader_kernel_id, - writer_kernel_id, - compute_with_storage_grid_size - ] - ( - const void* operation, - const Program& program, - const std::vector& input_tensors, - const std::vector>&, - const std::vector& output_tensors - ) { - auto src_tensor = input_tensors.at(0); - - auto dst_tensor = output_tensors.at(0); - - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - uint32_t num_cores_total = num_cores_x * num_cores_y; - - auto output_tensor_shape = dst_tensor.shape(); - uint32_t W_padded = output_tensor_shape[3], H_padded = output_tensor_shape[2], C_padded = output_tensor_shape[1], N_padded = output_tensor_shape[0]; - uint32_t NCH_padded = H_padded * C_padded * N_padded; - - auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_padded_per_core_group_1, num_sticks_padded_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, NCH_padded); - auto all_runtime_args = get_runtime_args_rm(src_tensor, dst_tensor, num_cores_total, num_cores, 
num_cores_y, core_group_1, num_sticks_padded_per_core_group_1, core_group_2, num_sticks_padded_per_core_group_2); - - for(uint32_t i = 0; i < num_cores_total; i++) { - CoreCoord core = {i / num_cores_y, i % num_cores_y}; - - { - SetRuntimeArgs(program, reader_kernel_id, core, all_runtime_args[i].first); - } - - { - SetRuntimeArgs(program, writer_kernel_id, core, all_runtime_args[i].second); - } - } - }; + const float pad_value); - return {.program=std::move(program), .override_runtime_arguments_callback=override_runtime_args_callback}; -} diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp index 18f310fa4e7..2f109e2ced5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp @@ -120,7 +120,7 @@ operation::ProgramWithCallbacks Slice::create_program( const auto &input_tensor_a = input_tensors.at(0); auto &output_tensor = output_tensors.at(0); - return slice_multi_core(input_tensor_a, output_tensor, this->slice_start, this->slice_end); + return detail::slice_multi_core(input_tensor_a, output_tensor, this->slice_start, this->slice_end); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp new file mode 100644 index 00000000000..0c2c44a1ea3 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "optional" +#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/work_split.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "tt_metal/host_api.hpp" + +#include "slice_op.hpp" +using namespace tt::constants; + + +namespace ttnn::operations::data_movement::detail { + +inline std::vector, std::vector>> get_slice_runtime_args_rm( + const Tensor& input_tensor, + Tensor& output_tensor, + const tt::tt_metal::Shape& output_tensor_start, + uint32_t num_cores_total, + uint32_t num_cores, + uint32_t num_cores_y, + CoreRangeSet core_group_1, + CoreRangeSet core_group_2, + uint32_t num_sticks_per_core_group_1, + uint32_t num_sticks_per_core_group_2, + uint32_t max_read_size) { + tt::tt_metal::Device* device = input_tensor.device(); + + auto input_buffer = input_tensor.buffer(); + auto output_buffer = output_tensor.buffer(); + auto input_shape = input_tensor.get_legacy_shape(); + auto output_shape = output_tensor.get_legacy_shape(); + + uint32_t padded_row_size_bytes = input_shape[-1] * input_tensor.element_size(); + uint32_t unpadded_row_size_bytes = output_shape[-1] * input_tensor.element_size(); + + std::uint32_t num_dims = static_cast(input_shape.rank()); + std::vector num_unpadded_sticks_per_dim(num_dims); + std::vector num_padded_sticks_per_dim(num_dims); + std::vector id_per_dim(num_dims); + + std::vector accumulated_total_per_dim(num_dims); + + // TODO: Remove first element of these arrays and update kernel accordingly + // This currently just matches tile version where we iterate over the row as well + num_unpadded_sticks_per_dim[0] = 1; + num_padded_sticks_per_dim[0] = 0; + accumulated_total_per_dim[0] = 1; + + for (int32_t i = 1; i < num_dims; i++) { + uint32_t num_unpadded_dim = output_shape[-(i + 1)]; + uint32_t num_total_dim = input_shape[-(i + 
1)]; + uint32_t num_padded_dim = (num_total_dim - num_unpadded_dim) * accumulated_total_per_dim[i - 1]; + num_unpadded_sticks_per_dim[i] = num_unpadded_dim; + num_padded_sticks_per_dim[i] = num_padded_dim; + accumulated_total_per_dim[i] = num_total_dim * accumulated_total_per_dim[i - 1]; + } + + uint32_t unpadded_row_size_bytes_offset = tt::round_up(unpadded_row_size_bytes, TILE_WIDTH / 2); + + vector common_reader_kernel_args = { + input_tensor.buffer()->address() + output_tensor_start[-1] * output_tensor.element_size(), + padded_row_size_bytes, + unpadded_row_size_bytes, + unpadded_row_size_bytes_offset, + num_dims, + 0, + 0, + 0, + 0}; + common_reader_kernel_args.insert( + common_reader_kernel_args.end(), num_unpadded_sticks_per_dim.begin(), num_unpadded_sticks_per_dim.end()); + common_reader_kernel_args.insert( + common_reader_kernel_args.end(), num_padded_sticks_per_dim.begin(), num_padded_sticks_per_dim.end()); + + std::vector, std::vector>> ret_val(num_cores_total); + + uint32_t start_offset = ttnn::operations::data_movement::get_rm_start_offset(input_tensor, ttnn::Shape(output_tensor_start)); + for (uint32_t i = 0, num_sticks_written = 0; i < num_cores_total; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + uint32_t num_sticks_per_core; + if (core_group_1.core_coord_in_core_ranges(core)) { + num_sticks_per_core = num_sticks_per_core_group_1; + } else if (core_group_2.core_coord_in_core_ranges(core)) { + num_sticks_per_core = num_sticks_per_core_group_2; + } else { + // no-op + num_sticks_per_core = 0; + } + + // issue more reads before calling barrier + uint32_t num_sticks_per_core_read = 0, num_read_per_barrier = 0; + if (num_sticks_per_core != 0) { + auto num_sticks_per_core_pad32 = num_sticks_per_core + (32 - num_sticks_per_core % 32) % 32; + num_sticks_per_core_read = merge_num_sticks_to_read(num_sticks_per_core_pad32, unpadded_row_size_bytes_offset, max_read_size); + num_read_per_barrier = num_sticks_per_core_pad32 / num_sticks_per_core_read; + } + + id_per_dim[0] = num_sticks_written % num_unpadded_sticks_per_dim[0]; + uint32_t unpadded_written = num_sticks_written / num_unpadded_sticks_per_dim[0]; + uint32_t start_id = id_per_dim[0] + start_offset; + + for (uint32_t j = 1; j < num_dims; j++) { + id_per_dim[j] = unpadded_written % num_unpadded_sticks_per_dim[j]; + unpadded_written = unpadded_written / num_unpadded_sticks_per_dim[j]; + start_id += id_per_dim[j] * accumulated_total_per_dim[j - 1]; + } + vector reader_kernel_args = common_reader_kernel_args; + // + uint32_t addr_offset = 5; // input buffer addr, padded_row_size_bytes, unpadded_row_size_bytes, num_dims + reader_kernel_args[addr_offset++] = start_id; + reader_kernel_args[addr_offset++] = num_sticks_per_core; + reader_kernel_args[addr_offset++] = num_sticks_per_core_read; + reader_kernel_args[addr_offset] = num_read_per_barrier; + reader_kernel_args.insert(reader_kernel_args.end(), id_per_dim.begin(), id_per_dim.end()); + + vector writer_kernel_args = { + output_buffer->address(), unpadded_row_size_bytes, unpadded_row_size_bytes_offset, num_sticks_per_core, num_sticks_per_core_read, num_read_per_barrier, num_sticks_written, 0}; + num_sticks_written += num_sticks_per_core; + ret_val[i] = {reader_kernel_args, writer_kernel_args}; + } + + return ret_val; +} + +operation::ProgramWithCallbacks slice_rm_multi_core( + const Tensor& a, Tensor& output, const tt::tt_metal::Shape& output_tensor_start, const tt::tt_metal::Shape& output_tensor_end) { + const tt::tt_metal::Shape output_shape = 
output.get_legacy_shape(); + + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + // This should allocate a DRAM buffer on the device + tt::tt_metal::Device* device = a.device(); + + uint32_t num_unpadded_sticks = output.volume() / output.get_legacy_shape()[-1]; + + auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + + CoreRange total_cores({0, 0}, {num_cores_x - 1, num_cores_y - 1}); + uint32_t num_cores_total = num_cores_x * num_cores_y; + auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_per_core_group_1, num_sticks_per_core_group_2] = + split_work_to_cores(compute_with_storage_grid_size, num_unpadded_sticks); + + tt::tt_metal::Buffer* src0_buffer = a.buffer(); + + tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + + uint32_t padded_row_size_bytes = a.get_legacy_shape()[-1] * a.element_size(); + uint32_t unpadded_row_size_bytes = output_shape[-1] * a.element_size(); + + tt::tt_metal::Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + bool src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + std::vector reader_compile_time_args_vec = {(std::uint32_t)src0_is_dram}; + bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + + uint32_t src_stick_size = padded_row_size_bytes; + uint32_t dst_stick_size = unpadded_row_size_bytes; + + uint32_t src0_cb_index = 0; + uint32_t max_read_size = 4096; + uint32_t cb_page_size = dst_is_dram ? tt::round_up(unpadded_row_size_bytes, TILE_WIDTH) : tt::round_up(unpadded_row_size_bytes, TILE_WIDTH / 2); + uint32_t num_input_pages = num_sticks_per_core_group_1 > num_sticks_per_core_group_2 ? 
num_sticks_per_core_group_1 : num_sticks_per_core_group_2; + uint32_t num_sticks_per_core_read = 0, num_read_per_barrier = 0; + if (num_input_pages != 0) { + auto num_sticks_per_core_pad32 = num_input_pages + (32 - num_input_pages % 32) % 32; + num_sticks_per_core_read = merge_num_sticks_to_read(num_sticks_per_core_pad32, cb_page_size, max_read_size); + num_read_per_barrier = num_sticks_per_core_pad32 / num_sticks_per_core_read; + } + tt::tt_metal::CircularBufferConfig cb_src0_config = + tt::tt_metal::CircularBufferConfig(num_read_per_barrier * 2 * cb_page_size, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, cb_page_size); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); + + + std::vector writer_compile_time_args_vec = {(std::uint32_t)src0_cb_index, (std::uint32_t)dst_is_dram}; + + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp", + total_cores, + tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args_vec)); + + tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp", + total_cores, + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args_vec)); + + auto all_runtime_args = get_slice_runtime_args_rm( + a, + output, + output_tensor_start, + num_cores_total, + num_cores, + num_cores_y, + core_group_1, + core_group_2, + num_sticks_per_core_group_1, + num_sticks_per_core_group_2, + max_read_size); + + for (uint32_t i = 0, num_sticks_written = 0; i < num_cores_total; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); + + tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); + } + + auto override_runtime_args_callback = + [unary_reader_kernel_id, unary_writer_kernel_id, compute_with_storage_grid_size, max_read_size]( + const void* operation, + const Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors) { + auto src_tensor = input_tensors.at(0); + auto dst_tensor = output_tensors.at(0); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + uint32_t num_cores_total = num_cores_x * num_cores_y; + uint32_t num_unpadded_sticks = dst_tensor.volume() / dst_tensor.get_legacy_shape()[-1]; + auto + [num_cores, + all_cores, + core_group_1, + core_group_2, + num_sticks_per_core_group_1, + num_sticks_per_core_group_2] = + split_work_to_cores(compute_with_storage_grid_size, num_unpadded_sticks); + + const auto tensor_start = static_cast(operation)->slice_start; + auto all_runtime_args = get_slice_runtime_args_rm( + src_tensor, + dst_tensor, + tensor_start, + num_cores_total, + num_cores, + num_cores_y, + core_group_1, + core_group_2, + num_sticks_per_core_group_1, + num_sticks_per_core_group_2, + max_read_size); + + for (uint32_t i = 0, num_tiles_written = 0; i < num_cores_total; i++) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + { SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); } + + { SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); } + } + }; 
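// Editor's note: a minimal, standalone sketch of two index computations used repeatedly in the
// runtime-arg code above -- mapping a flat core index onto the grid (`{i / num_cores_y, i % num_cores_y}`)
// and rounding a per-core stick count up to a multiple of 32 before the reads are batched between
// barriers. Plain C++, no tt-metal calls; GridCoord / flat_index_to_core / pad_to_32 are illustrative
// names, not identifiers from this patch.
#include <cstdint>
#include <cassert>

struct GridCoord { uint32_t x; uint32_t y; };

// Walk each grid column top to bottom before moving to the next column.
GridCoord flat_index_to_core(uint32_t i, uint32_t num_cores_y) {
    return GridCoord{i / num_cores_y, i % num_cores_y};
}

// Round up to the next multiple of 32; already-aligned counts are unchanged.
uint32_t pad_to_32(uint32_t num_sticks) {
    return num_sticks + (32 - num_sticks % 32) % 32;
}

int main() {
    assert(pad_to_32(0) == 0);
    assert(pad_to_32(1) == 32);
    assert(pad_to_32(32) == 32);
    assert(pad_to_32(33) == 64);
    GridCoord c = flat_index_to_core(5, 4);  // grid with 4 cores per column
    assert(c.x == 1 && c.y == 1);
    return 0;
}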
+ + return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; +} + +template +inline __attribute__((always_inline)) void set_slice_runtime_args_tile( + const Tensor& input_tensor, + const Tensor& output_tensor, + const tt::tt_metal::Shape& output_tensor_start, + const uint32_t& num_cores_total, + const uint32_t& num_cores, + const std::vector& cores, + const uint32_t& num_cores_group_1, + const uint32_t& num_cores_group_2, + const uint32_t& num_tiles_per_core_group_1, + const uint32_t& num_tiles_per_core_group_2, + const Program& program, + const tt::tt_metal::KernelHandle& unary_reader_kernel_id, + const tt::tt_metal::KernelHandle& unary_writer_kernel_id, + std::vector& accumulated_total_per_dim) { + const auto input_buffer = input_tensor.buffer(); + const auto output_buffer = output_tensor.buffer(); + const auto& input_shape = input_tensor.get_legacy_shape(); + const auto& output_shape = output_tensor.get_legacy_shape(); + + std::uint32_t num_dims = static_cast(input_shape.rank()); + + uint32_t num_unpadded_Xt = output_shape[-1] / TILE_WIDTH; + uint32_t num_total_Xt = input_shape[-1] / TILE_WIDTH; + uint32_t num_padded_Xt = num_total_Xt - num_unpadded_Xt; + uint32_t num_unpadded_Yt = output_shape[-2] / TILE_HEIGHT; + uint32_t num_total_Yt = input_shape[-2] / TILE_HEIGHT; + uint32_t num_padded_Yt = (num_total_Yt - num_unpadded_Yt) * num_total_Xt; + + const auto set_common_reader_args = [&]( + uint32_t* reader_common_args, + uint32_t* num_unpadded_tiles_per_dim, + uint32_t* num_padded_tiles_per_dim) __attribute__((always_inline)) { + reader_common_args[0] = input_buffer->address(); + num_unpadded_tiles_per_dim[0] = num_unpadded_Xt; + num_unpadded_tiles_per_dim[1] = num_unpadded_Yt; + num_padded_tiles_per_dim[0] = num_padded_Xt; + num_padded_tiles_per_dim[1] = num_padded_Yt; + accumulated_total_per_dim[0] = num_total_Xt; + accumulated_total_per_dim[1] = num_total_Yt * num_total_Xt; + for (int32_t i = 2; i < num_dims; ++i) { + uint32_t num_unpadded_dim = output_shape[-(i + 1)]; + uint32_t num_total_dim = input_shape[-(i + 1)]; + uint32_t num_padded_dim = (num_total_dim - num_unpadded_dim) * accumulated_total_per_dim[i - 1]; + num_unpadded_tiles_per_dim[i] = num_unpadded_dim; + num_padded_tiles_per_dim[i] = num_padded_dim; + accumulated_total_per_dim[i] = num_total_dim * accumulated_total_per_dim[i - 1]; + } + }; + + const auto set_reader_rt_args = [&]( + uint32_t* reader_rt_args, + const uint32_t* num_unpadded_tiles_per_dim, + const uint32_t* num_padded_tiles_per_dim, + const uint32_t& num_tiles_per_core, + const uint32_t& start_offset, + const uint32_t& num_tiles_written) __attribute__((always_inline)) { + reader_rt_args[2] = num_tiles_written % num_unpadded_tiles_per_dim[0]; + uint32_t unpadded_written = num_tiles_written / num_unpadded_tiles_per_dim[0]; + uint32_t start_id = reader_rt_args[2] + start_offset; + for (uint32_t j = 1; j < num_dims; ++j) { + reader_rt_args[2 + j] = unpadded_written % num_unpadded_tiles_per_dim[j]; + unpadded_written = unpadded_written / num_unpadded_tiles_per_dim[j]; + start_id += reader_rt_args[2 + j] * accumulated_total_per_dim[j - 1]; + } + reader_rt_args[0] = start_id; + reader_rt_args[1] = num_tiles_per_core; + }; + + if constexpr (initialize_args) { + std::vector reader_common_args(1 + num_dims * 2); + uint32_t* num_unpadded_tiles_per_dim = reader_common_args.data() + 1; + uint32_t* num_padded_tiles_per_dim = num_unpadded_tiles_per_dim + num_dims; + set_common_reader_args(reader_common_args.data(), 
num_unpadded_tiles_per_dim, num_padded_tiles_per_dim); + SetCommonRuntimeArgs(program, unary_reader_kernel_id, reader_common_args); + } + auto& reader_common_args = GetCommonRuntimeArgs(program, unary_reader_kernel_id); + uint32_t* num_unpadded_tiles_per_dim = reader_common_args.data() + 1; + uint32_t* num_padded_tiles_per_dim = num_unpadded_tiles_per_dim + num_dims; + if constexpr (!initialize_args) { + set_common_reader_args(reader_common_args.data(), num_unpadded_tiles_per_dim, num_padded_tiles_per_dim); + } + + uint32_t start_offset = ttnn::operations::data_movement::get_tiled_start_offset(input_tensor, ttnn::Shape(output_tensor_start)); + + auto& reader_kernel_args_by_core = GetRuntimeArgs(program, unary_reader_kernel_id); + auto& writer_kernel_args_by_core = GetRuntimeArgs(program, unary_writer_kernel_id); + const uint32_t num_used_cores = num_cores_group_1 + num_cores_group_2; + for (uint32_t i = 0, num_tiles_written = 0; i < num_cores_total; ++i) { + const CoreCoord& core = cores[i]; + uint32_t num_tiles_per_core; + if (i < num_cores_group_1) { + num_tiles_per_core = num_tiles_per_core_group_1; + } else if (i < num_used_cores) { + num_tiles_per_core = num_tiles_per_core_group_2; + } else { + // no-op + if constexpr (initialize_args) { + std::vector reader_kernel_args(2 + num_dims, 0); + std::vector writer_kernel_args(3, 0); + tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); + tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_kernel_args); + } else { + auto& reader_kernel_args = reader_kernel_args_by_core[core.x][core.y]; + reader_kernel_args[1] = 0; + auto& writer_kernel_args = writer_kernel_args_by_core[core.x][core.y]; + writer_kernel_args[1] = 0; + } + continue; + } + + if constexpr (initialize_args) { + std::vector reader_kernel_args(2 + num_dims); + set_reader_rt_args( + reader_kernel_args.data(), + num_unpadded_tiles_per_dim, + num_padded_tiles_per_dim, + num_tiles_per_core, + start_offset, + num_tiles_written); + SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); + } else { + auto& reader_kernel_args = reader_kernel_args_by_core[core.x][core.y]; + set_reader_rt_args( + reader_kernel_args.data(), + num_unpadded_tiles_per_dim, + num_padded_tiles_per_dim, + num_tiles_per_core, + start_offset, + num_tiles_written); + } + + if constexpr (initialize_args) { + vector writer_kernel_args = {output_buffer->address(), num_tiles_per_core, num_tiles_written}; + tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_kernel_args); + } else { + auto& writer_kernel_args = writer_kernel_args_by_core[core.x][core.y]; + writer_kernel_args[0] = output_buffer->address(); + writer_kernel_args[1] = num_tiles_per_core; + writer_kernel_args[2] = num_tiles_written; + } + num_tiles_written += num_tiles_per_core; + } +} + +operation::ProgramWithCallbacks slice_tile_multi_core( + const Tensor& a, Tensor& output, const tt::tt_metal::Shape& output_tensor_start, const tt::tt_metal::Shape& output_tensor_end) { + const tt::tt_metal::Shape output_shape = output.get_legacy_shape(); + + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + // This should allocate a DRAM buffer on the device + tt::tt_metal::Device* device = a.device(); + + uint32_t num_unpadded_tiles = output.volume() / TILE_HW; + + auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; 
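// Editor's note: a self-contained sketch of how the runtime-arg helpers above turn a flat count of
// already-written sticks/tiles into per-dimension indices and a start id inside the larger (padded)
// input. Names here are illustrative only; the real helpers additionally fold in
// get_rm_start_offset / get_tiled_start_offset and keep the per-dim ids for the kernel args.
#include <cstdint>
#include <vector>
#include <cassert>

// unpadded[d]    : output extent along dimension d (innermost first)
// accumulated[d] : product of the *input* extents of dims 0..d
// written        : flat number of output elements assigned to earlier cores
uint32_t compute_start_id(uint32_t written,
                          const std::vector<uint32_t>& unpadded,
                          const std::vector<uint32_t>& accumulated,
                          uint32_t start_offset) {
    uint32_t id0 = written % unpadded[0];
    uint32_t remaining = written / unpadded[0];
    uint32_t start_id = id0 + start_offset;
    for (size_t j = 1; j < unpadded.size(); ++j) {
        uint32_t idj = remaining % unpadded[j];   // index along dimension j
        remaining /= unpadded[j];
        start_id += idj * accumulated[j - 1];     // skip full input strides of the dims below
    }
    return start_id;
}

int main() {
    // Toy case: slice 3 of 5 rows out of each of 2 (of 4) pages, innermost dim first.
    std::vector<uint32_t> unpadded    = {3, 2};
    std::vector<uint32_t> accumulated = {5, 20};
    // The 4th output row (written == 3) opens the second output page, which
    // starts at input row 5.
    assert(compute_start_id(3, unpadded, accumulated, 0) == 5);
    return 0;
}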
+ auto num_cores_total = num_cores_x * num_cores_y; + CoreRange total_cores({0, 0}, {num_cores_x - 1, num_cores_y - 1}); + + auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = + split_work_to_cores(compute_with_storage_grid_size, num_unpadded_tiles); + + tt::tt_metal::Buffer* src0_buffer = a.buffer(); + + tt::tt_metal::Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t single_tile_size = tt::tt_metal::detail::TileSize(cb_data_format); + + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = 2; + tt::tt_metal::CircularBufferConfig cb_src0_config = + tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); + + std::uint32_t num_dims = static_cast(a.get_legacy_shape().rank()); + + // Reader compile-time args + // Data is 32 byte aligned + bool src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + std::vector reader_compile_time_args = { + static_cast(src0_cb_index), + static_cast(num_dims), + static_cast(src0_is_dram), + }; + std::vector writer_compile_time_args = { + static_cast(src0_cb_index), static_cast(dst_is_dram)}; + + // Tilized reader + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/reader_unary_unpad_dims_interleaved_start_id.cpp", + total_cores, + tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); + + tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + total_cores, + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); + + const auto cores = grid_to_cores(num_cores_total, num_cores_x, num_cores_y, false); + + std::vector accumulated_total_per_dim(num_dims); + set_slice_runtime_args_tile( + a, + output, + output_tensor_start, + num_cores_total, + num_cores, + cores, + core_group_1.num_cores(), + core_group_2.num_cores(), + num_tiles_per_core_group_1, + num_tiles_per_core_group_2, + program, + unary_reader_kernel_id, + unary_writer_kernel_id, + accumulated_total_per_dim); + + auto override_runtime_args_callback = [unary_reader_kernel_id, + unary_writer_kernel_id, + compute_with_storage_grid_size, + cores, + accumulated_total_per_dim]( + const void* operation, + const Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors) mutable { + const Tensor& src_tensor = input_tensors[0]; + const Tensor& dst_tensor = output_tensors[0]; + uint32_t num_unpadded_tiles = dst_tensor.volume() / TILE_HW; + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + uint32_t num_cores_total = cores.size(); + + auto + [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = + split_work_to_cores(compute_with_storage_grid_size, num_unpadded_tiles); + + const auto& tensor_start = 
static_cast(operation)->slice_start; + set_slice_runtime_args_tile( + src_tensor, + dst_tensor, + tensor_start, + num_cores_total, + num_cores, + cores, + core_group_1.num_cores(), + core_group_2.num_cores(), + num_tiles_per_core_group_1, + num_tiles_per_core_group_2, + program, + unary_reader_kernel_id, + unary_writer_kernel_id, + accumulated_total_per_dim); + }; + + return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks slice_multi_core( + const Tensor& a, Tensor& output, const tt::tt_metal::Shape& output_tensor_start, const tt::tt_metal::Shape& output_tensor_end) { + switch (a.get_layout()) { + case Layout::ROW_MAJOR: return slice_rm_multi_core(a, output, output_tensor_start, output_tensor_end); + case Layout::TILE: return slice_tile_multi_core(a, output, output_tensor_start, output_tensor_end); + default: TT_ASSERT(false, "Unsupported Layout"); + } + return {}; +} + +} // namespace ttnn::operations::data_movement::detail + diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp index 10821e644a1..e532e4e73f7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp @@ -3,546 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "optional" -#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/work_split.hpp" -#include "tt_metal/common/constants.hpp" -#include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" -#include "slice_op.hpp" -using namespace tt::constants; +namespace ttnn::operations::data_movement::detail { -namespace tt { +operation::ProgramWithCallbacks slice_multi_core(const Tensor& a, Tensor& output, const tt::tt_metal::Shape& output_tensor_start, const tt::tt_metal::Shape& output_tensor_end); -namespace tt_metal { +} // namespace ttnn::operations::data_movement::detail -inline std::vector, std::vector>> get_slice_runtime_args_rm( - const Tensor& input_tensor, - Tensor& output_tensor, - const Shape& output_tensor_start, - uint32_t num_cores_total, - uint32_t num_cores, - uint32_t num_cores_y, - CoreRangeSet core_group_1, - CoreRangeSet core_group_2, - uint32_t num_sticks_per_core_group_1, - uint32_t num_sticks_per_core_group_2, - uint32_t max_read_size) { - tt_metal::Device* device = input_tensor.device(); - - auto input_buffer = input_tensor.buffer(); - auto output_buffer = output_tensor.buffer(); - auto input_shape = input_tensor.get_legacy_shape(); - auto output_shape = output_tensor.get_legacy_shape(); - - uint32_t padded_row_size_bytes = input_shape[-1] * input_tensor.element_size(); - uint32_t unpadded_row_size_bytes = output_shape[-1] * input_tensor.element_size(); - - std::uint32_t num_dims = static_cast(input_shape.rank()); - std::vector num_unpadded_sticks_per_dim(num_dims); - std::vector num_padded_sticks_per_dim(num_dims); - std::vector id_per_dim(num_dims); - - std::vector accumulated_total_per_dim(num_dims); - - // TODO: Remove first element of these arrays and update kernel accordingly - // This currently just matches tile version where we iterate over the row as well - num_unpadded_sticks_per_dim[0] = 1; - num_padded_sticks_per_dim[0] = 0; - accumulated_total_per_dim[0] = 1; - - for (int32_t i = 1; i < num_dims; i++) { - uint32_t num_unpadded_dim = output_shape[-(i + 
1)]; - uint32_t num_total_dim = input_shape[-(i + 1)]; - uint32_t num_padded_dim = (num_total_dim - num_unpadded_dim) * accumulated_total_per_dim[i - 1]; - num_unpadded_sticks_per_dim[i] = num_unpadded_dim; - num_padded_sticks_per_dim[i] = num_padded_dim; - accumulated_total_per_dim[i] = num_total_dim * accumulated_total_per_dim[i - 1]; - } - - uint32_t unpadded_row_size_bytes_offset = round_up(unpadded_row_size_bytes, TILE_WIDTH / 2); - - vector common_reader_kernel_args = { - input_tensor.buffer()->address() + output_tensor_start[-1] * output_tensor.element_size(), - padded_row_size_bytes, - unpadded_row_size_bytes, - unpadded_row_size_bytes_offset, - num_dims, - 0, - 0, - 0, - 0}; - common_reader_kernel_args.insert( - common_reader_kernel_args.end(), num_unpadded_sticks_per_dim.begin(), num_unpadded_sticks_per_dim.end()); - common_reader_kernel_args.insert( - common_reader_kernel_args.end(), num_padded_sticks_per_dim.begin(), num_padded_sticks_per_dim.end()); - - std::vector, std::vector>> ret_val(num_cores_total); - - uint32_t start_offset = ttnn::operations::data_movement::get_rm_start_offset(input_tensor, ttnn::Shape(output_tensor_start)); - for (uint32_t i = 0, num_sticks_written = 0; i < num_cores_total; i++) { - CoreCoord core = {i / num_cores_y, i % num_cores_y}; - uint32_t num_sticks_per_core; - if (core_group_1.core_coord_in_core_ranges(core)) { - num_sticks_per_core = num_sticks_per_core_group_1; - } else if (core_group_2.core_coord_in_core_ranges(core)) { - num_sticks_per_core = num_sticks_per_core_group_2; - } else { - // no-op - num_sticks_per_core = 0; - } - - // issue more reads before calling barrier - uint32_t num_sticks_per_core_read = 0, num_read_per_barrier = 0; - if (num_sticks_per_core != 0) { - auto num_sticks_per_core_pad32 = num_sticks_per_core + (32 - num_sticks_per_core % 32) % 32; - num_sticks_per_core_read = merge_num_sticks_to_read(num_sticks_per_core_pad32, unpadded_row_size_bytes_offset, max_read_size); - num_read_per_barrier = num_sticks_per_core_pad32 / num_sticks_per_core_read; - } - - id_per_dim[0] = num_sticks_written % num_unpadded_sticks_per_dim[0]; - uint32_t unpadded_written = num_sticks_written / num_unpadded_sticks_per_dim[0]; - uint32_t start_id = id_per_dim[0] + start_offset; - - for (uint32_t j = 1; j < num_dims; j++) { - id_per_dim[j] = unpadded_written % num_unpadded_sticks_per_dim[j]; - unpadded_written = unpadded_written / num_unpadded_sticks_per_dim[j]; - start_id += id_per_dim[j] * accumulated_total_per_dim[j - 1]; - } - vector reader_kernel_args = common_reader_kernel_args; - // - uint32_t addr_offset = 5; // input buffer addr, padded_row_size_bytes, unpadded_row_size_bytes, num_dims - reader_kernel_args[addr_offset++] = start_id; - reader_kernel_args[addr_offset++] = num_sticks_per_core; - reader_kernel_args[addr_offset++] = num_sticks_per_core_read; - reader_kernel_args[addr_offset] = num_read_per_barrier; - reader_kernel_args.insert(reader_kernel_args.end(), id_per_dim.begin(), id_per_dim.end()); - - vector writer_kernel_args = { - output_buffer->address(), unpadded_row_size_bytes, unpadded_row_size_bytes_offset, num_sticks_per_core, num_sticks_per_core_read, num_read_per_barrier, num_sticks_written, 0}; - num_sticks_written += num_sticks_per_core; - ret_val[i] = {reader_kernel_args, writer_kernel_args}; - } - - return ret_val; -} - -operation::ProgramWithCallbacks slice_rm_multi_core( - const Tensor& a, Tensor& output, const Shape& output_tensor_start, const Shape& output_tensor_end) { - const Shape output_shape = 
output.get_legacy_shape(); - - tt_metal::Program program = tt_metal::CreateProgram(); - - // This should allocate a DRAM buffer on the device - tt_metal::Device* device = a.device(); - - uint32_t num_unpadded_sticks = output.volume() / output.get_legacy_shape()[-1]; - - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - CoreRange total_cores({0, 0}, {num_cores_x - 1, num_cores_y - 1}); - uint32_t num_cores_total = num_cores_x * num_cores_y; - auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_per_core_group_1, num_sticks_per_core_group_2] = - split_work_to_cores(compute_with_storage_grid_size, num_unpadded_sticks); - - tt_metal::Buffer* src0_buffer = a.buffer(); - - tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - - uint32_t padded_row_size_bytes = a.get_legacy_shape()[-1] * a.element_size(); - uint32_t unpadded_row_size_bytes = output_shape[-1] * a.element_size(); - - tt_metal::Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - bool src0_is_dram = src0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; - std::vector reader_compile_time_args_vec = {(std::uint32_t)src0_is_dram}; - bool dst_is_dram = dst_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; - - uint32_t src_stick_size = padded_row_size_bytes; - uint32_t dst_stick_size = unpadded_row_size_bytes; - - uint32_t src0_cb_index = 0; - uint32_t max_read_size = 4096; - uint32_t cb_page_size = dst_is_dram ? round_up(unpadded_row_size_bytes, TILE_WIDTH) : round_up(unpadded_row_size_bytes, TILE_WIDTH / 2); - uint32_t num_input_pages = num_sticks_per_core_group_1 > num_sticks_per_core_group_2 ? 
num_sticks_per_core_group_1 : num_sticks_per_core_group_2; - uint32_t num_sticks_per_core_read = 0, num_read_per_barrier = 0; - if (num_input_pages != 0) { - auto num_sticks_per_core_pad32 = num_input_pages + (32 - num_input_pages % 32) % 32; - num_sticks_per_core_read = merge_num_sticks_to_read(num_sticks_per_core_pad32, cb_page_size, max_read_size); - num_read_per_barrier = num_sticks_per_core_pad32 / num_sticks_per_core_read; - } - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig(num_read_per_barrier * 2 * cb_page_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, cb_page_size); - auto cb_src0 = tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - - - std::vector writer_compile_time_args_vec = {(std::uint32_t)src0_cb_index, (std::uint32_t)dst_is_dram}; - - tt_metal::KernelHandle unary_reader_kernel_id = tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp", - total_cores, - tt_metal::ReaderDataMovementConfig(reader_compile_time_args_vec)); - - tt_metal::KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp", - total_cores, - tt_metal::WriterDataMovementConfig(writer_compile_time_args_vec)); - - auto all_runtime_args = get_slice_runtime_args_rm( - a, - output, - output_tensor_start, - num_cores_total, - num_cores, - num_cores_y, - core_group_1, - core_group_2, - num_sticks_per_core_group_1, - num_sticks_per_core_group_2, - max_read_size); - - for (uint32_t i = 0, num_sticks_written = 0; i < num_cores_total; i++) { - CoreCoord core = {i / num_cores_y, i % num_cores_y}; - tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); - - tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); - } - - auto override_runtime_args_callback = - [unary_reader_kernel_id, unary_writer_kernel_id, compute_with_storage_grid_size, max_read_size]( - const void* operation, - const Program& program, - const std::vector& input_tensors, - const std::vector>&, - const std::vector& output_tensors) { - auto src_tensor = input_tensors.at(0); - auto dst_tensor = output_tensors.at(0); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - uint32_t num_cores_total = num_cores_x * num_cores_y; - uint32_t num_unpadded_sticks = dst_tensor.volume() / dst_tensor.get_legacy_shape()[-1]; - auto - [num_cores, - all_cores, - core_group_1, - core_group_2, - num_sticks_per_core_group_1, - num_sticks_per_core_group_2] = - split_work_to_cores(compute_with_storage_grid_size, num_unpadded_sticks); - - const auto tensor_start = static_cast(operation)->slice_start; - auto all_runtime_args = get_slice_runtime_args_rm( - src_tensor, - dst_tensor, - tensor_start, - num_cores_total, - num_cores, - num_cores_y, - core_group_1, - core_group_2, - num_sticks_per_core_group_1, - num_sticks_per_core_group_2, - max_read_size); - - for (uint32_t i = 0, num_tiles_written = 0; i < num_cores_total; i++) { - CoreCoord core = {i / num_cores_y, i % num_cores_y}; - - { SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); } - - { SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); } - } - }; - - return {.program = std::move(program), 
.override_runtime_arguments_callback = override_runtime_args_callback}; -} - -template -inline __attribute__((always_inline)) void set_slice_runtime_args_tile( - const Tensor& input_tensor, - const Tensor& output_tensor, - const Shape& output_tensor_start, - const uint32_t& num_cores_total, - const uint32_t& num_cores, - const std::vector& cores, - const uint32_t& num_cores_group_1, - const uint32_t& num_cores_group_2, - const uint32_t& num_tiles_per_core_group_1, - const uint32_t& num_tiles_per_core_group_2, - const Program& program, - const tt_metal::KernelHandle& unary_reader_kernel_id, - const tt_metal::KernelHandle& unary_writer_kernel_id, - std::vector& accumulated_total_per_dim) { - const auto input_buffer = input_tensor.buffer(); - const auto output_buffer = output_tensor.buffer(); - const auto& input_shape = input_tensor.get_legacy_shape(); - const auto& output_shape = output_tensor.get_legacy_shape(); - - std::uint32_t num_dims = static_cast(input_shape.rank()); - - uint32_t num_unpadded_Xt = output_shape[-1] / TILE_WIDTH; - uint32_t num_total_Xt = input_shape[-1] / TILE_WIDTH; - uint32_t num_padded_Xt = num_total_Xt - num_unpadded_Xt; - uint32_t num_unpadded_Yt = output_shape[-2] / TILE_HEIGHT; - uint32_t num_total_Yt = input_shape[-2] / TILE_HEIGHT; - uint32_t num_padded_Yt = (num_total_Yt - num_unpadded_Yt) * num_total_Xt; - - const auto set_common_reader_args = [&]( - uint32_t* reader_common_args, - uint32_t* num_unpadded_tiles_per_dim, - uint32_t* num_padded_tiles_per_dim) __attribute__((always_inline)) { - reader_common_args[0] = input_buffer->address(); - num_unpadded_tiles_per_dim[0] = num_unpadded_Xt; - num_unpadded_tiles_per_dim[1] = num_unpadded_Yt; - num_padded_tiles_per_dim[0] = num_padded_Xt; - num_padded_tiles_per_dim[1] = num_padded_Yt; - accumulated_total_per_dim[0] = num_total_Xt; - accumulated_total_per_dim[1] = num_total_Yt * num_total_Xt; - for (int32_t i = 2; i < num_dims; ++i) { - uint32_t num_unpadded_dim = output_shape[-(i + 1)]; - uint32_t num_total_dim = input_shape[-(i + 1)]; - uint32_t num_padded_dim = (num_total_dim - num_unpadded_dim) * accumulated_total_per_dim[i - 1]; - num_unpadded_tiles_per_dim[i] = num_unpadded_dim; - num_padded_tiles_per_dim[i] = num_padded_dim; - accumulated_total_per_dim[i] = num_total_dim * accumulated_total_per_dim[i - 1]; - } - }; - - const auto set_reader_rt_args = [&]( - uint32_t* reader_rt_args, - const uint32_t* num_unpadded_tiles_per_dim, - const uint32_t* num_padded_tiles_per_dim, - const uint32_t& num_tiles_per_core, - const uint32_t& start_offset, - const uint32_t& num_tiles_written) __attribute__((always_inline)) { - reader_rt_args[2] = num_tiles_written % num_unpadded_tiles_per_dim[0]; - uint32_t unpadded_written = num_tiles_written / num_unpadded_tiles_per_dim[0]; - uint32_t start_id = reader_rt_args[2] + start_offset; - for (uint32_t j = 1; j < num_dims; ++j) { - reader_rt_args[2 + j] = unpadded_written % num_unpadded_tiles_per_dim[j]; - unpadded_written = unpadded_written / num_unpadded_tiles_per_dim[j]; - start_id += reader_rt_args[2 + j] * accumulated_total_per_dim[j - 1]; - } - reader_rt_args[0] = start_id; - reader_rt_args[1] = num_tiles_per_core; - }; - - if constexpr (initialize_args) { - std::vector reader_common_args(1 + num_dims * 2); - uint32_t* num_unpadded_tiles_per_dim = reader_common_args.data() + 1; - uint32_t* num_padded_tiles_per_dim = num_unpadded_tiles_per_dim + num_dims; - set_common_reader_args(reader_common_args.data(), num_unpadded_tiles_per_dim, num_padded_tiles_per_dim); - 
SetCommonRuntimeArgs(program, unary_reader_kernel_id, reader_common_args); - } - auto& reader_common_args = GetCommonRuntimeArgs(program, unary_reader_kernel_id); - uint32_t* num_unpadded_tiles_per_dim = reader_common_args.data() + 1; - uint32_t* num_padded_tiles_per_dim = num_unpadded_tiles_per_dim + num_dims; - if constexpr (!initialize_args) { - set_common_reader_args(reader_common_args.data(), num_unpadded_tiles_per_dim, num_padded_tiles_per_dim); - } - - uint32_t start_offset = ttnn::operations::data_movement::get_tiled_start_offset(input_tensor, ttnn::Shape(output_tensor_start)); - - auto& reader_kernel_args_by_core = GetRuntimeArgs(program, unary_reader_kernel_id); - auto& writer_kernel_args_by_core = GetRuntimeArgs(program, unary_writer_kernel_id); - const uint32_t num_used_cores = num_cores_group_1 + num_cores_group_2; - for (uint32_t i = 0, num_tiles_written = 0; i < num_cores_total; ++i) { - const CoreCoord& core = cores[i]; - uint32_t num_tiles_per_core; - if (i < num_cores_group_1) { - num_tiles_per_core = num_tiles_per_core_group_1; - } else if (i < num_used_cores) { - num_tiles_per_core = num_tiles_per_core_group_2; - } else { - // no-op - if constexpr (initialize_args) { - std::vector reader_kernel_args(2 + num_dims, 0); - std::vector writer_kernel_args(3, 0); - tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); - tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_kernel_args); - } else { - auto& reader_kernel_args = reader_kernel_args_by_core[core.x][core.y]; - reader_kernel_args[1] = 0; - auto& writer_kernel_args = writer_kernel_args_by_core[core.x][core.y]; - writer_kernel_args[1] = 0; - } - continue; - } - - if constexpr (initialize_args) { - std::vector reader_kernel_args(2 + num_dims); - set_reader_rt_args( - reader_kernel_args.data(), - num_unpadded_tiles_per_dim, - num_padded_tiles_per_dim, - num_tiles_per_core, - start_offset, - num_tiles_written); - SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); - } else { - auto& reader_kernel_args = reader_kernel_args_by_core[core.x][core.y]; - set_reader_rt_args( - reader_kernel_args.data(), - num_unpadded_tiles_per_dim, - num_padded_tiles_per_dim, - num_tiles_per_core, - start_offset, - num_tiles_written); - } - - if constexpr (initialize_args) { - vector writer_kernel_args = {output_buffer->address(), num_tiles_per_core, num_tiles_written}; - tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_kernel_args); - } else { - auto& writer_kernel_args = writer_kernel_args_by_core[core.x][core.y]; - writer_kernel_args[0] = output_buffer->address(); - writer_kernel_args[1] = num_tiles_per_core; - writer_kernel_args[2] = num_tiles_written; - } - num_tiles_written += num_tiles_per_core; - } -} - -operation::ProgramWithCallbacks slice_tile_multi_core( - const Tensor& a, Tensor& output, const Shape& output_tensor_start, const Shape& output_tensor_end) { - const Shape output_shape = output.get_legacy_shape(); - - tt_metal::Program program = tt_metal::CreateProgram(); - - // This should allocate a DRAM buffer on the device - tt_metal::Device* device = a.device(); - - uint32_t num_unpadded_tiles = output.volume() / TILE_HW; - - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - auto num_cores_total = num_cores_x * num_cores_y; - CoreRange total_cores({0, 0}, {num_cores_x - 1, num_cores_y - 1}); - 
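// Editor's note: the "-" lines in this hunk are the old header-only implementation being deleted;
// after the patch the .hpp keeps only the slice_multi_core declaration and the definition lives in
// the new slice_program_factory.cpp, compiled as its own translation unit. Below is a toy,
// tt-metal-free illustration of that declaration/definition split, collapsed into one file with the
// would-be file boundaries marked in comments; the my_op_* names are hypothetical, not from this patch.
#include <string>
#include <iostream>

// ---- my_op_program_factory.hpp (declaration only; this is all callers need to include) ----
namespace ttnn::operations::data_movement::detail {
std::string my_op_multi_core(const std::string& input);
}  // namespace ttnn::operations::data_movement::detail

// ---- my_op_program_factory.cpp (definition; built once instead of in every includer) ----
namespace ttnn::operations::data_movement::detail {
std::string my_op_multi_core(const std::string& input) {
    // A real program factory builds kernels, circular buffers and runtime args here;
    // the toy just tags its input so the example runs standalone.
    return "program(" + input + ")";
}
}  // namespace ttnn::operations::data_movement::detail

int main() {
    std::cout << ttnn::operations::data_movement::detail::my_op_multi_core("slice") << "\n";
    return 0;
}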
- auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = - split_work_to_cores(compute_with_storage_grid_size, num_unpadded_tiles); - - tt_metal::Buffer* src0_buffer = a.buffer(); - - tt_metal::Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); - - uint32_t src0_cb_index = 0; - uint32_t num_input_tiles = 2; - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - - std::uint32_t num_dims = static_cast(a.get_legacy_shape().rank()); - - // Reader compile-time args - // Data is 32 byte aligned - bool src0_is_dram = src0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; - bool dst_is_dram = dst_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; - std::vector reader_compile_time_args = { - static_cast(src0_cb_index), - static_cast(num_dims), - static_cast(src0_is_dram), - }; - std::vector writer_compile_time_args = { - static_cast(src0_cb_index), static_cast(dst_is_dram)}; - - // Tilized reader - tt_metal::KernelHandle unary_reader_kernel_id = tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/reader_unary_unpad_dims_interleaved_start_id.cpp", - total_cores, - tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - - tt_metal::KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", - total_cores, - tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - - const auto cores = grid_to_cores(num_cores_total, num_cores_x, num_cores_y, false); - - std::vector accumulated_total_per_dim(num_dims); - set_slice_runtime_args_tile( - a, - output, - output_tensor_start, - num_cores_total, - num_cores, - cores, - core_group_1.num_cores(), - core_group_2.num_cores(), - num_tiles_per_core_group_1, - num_tiles_per_core_group_2, - program, - unary_reader_kernel_id, - unary_writer_kernel_id, - accumulated_total_per_dim); - - auto override_runtime_args_callback = [unary_reader_kernel_id, - unary_writer_kernel_id, - compute_with_storage_grid_size, - cores, - accumulated_total_per_dim]( - const void* operation, - const Program& program, - const std::vector& input_tensors, - const std::vector>&, - const std::vector& output_tensors) mutable { - const Tensor& src_tensor = input_tensors[0]; - const Tensor& dst_tensor = output_tensors[0]; - uint32_t num_unpadded_tiles = dst_tensor.volume() / TILE_HW; - - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - uint32_t num_cores_total = cores.size(); - - auto - [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = - split_work_to_cores(compute_with_storage_grid_size, num_unpadded_tiles); - - const auto& tensor_start = static_cast(operation)->slice_start; - set_slice_runtime_args_tile( - src_tensor, - dst_tensor, - tensor_start, - num_cores_total, - num_cores, - cores, - core_group_1.num_cores(), - 
core_group_2.num_cores(), - num_tiles_per_core_group_1, - num_tiles_per_core_group_2, - program, - unary_reader_kernel_id, - unary_writer_kernel_id, - accumulated_total_per_dim); - }; - - return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; -} - -operation::ProgramWithCallbacks slice_multi_core( - const Tensor& a, Tensor& output, const Shape& output_tensor_start, const Shape& output_tensor_end) { - switch (a.get_layout()) { - case Layout::ROW_MAJOR: return slice_rm_multi_core(a, output, output_tensor_start, output_tensor_end); - case Layout::TILE: return slice_tile_multi_core(a, output, output_tensor_start, output_tensor_end); - default: TT_ASSERT(false, "Unsupported Layout"); - } - return {}; -} - -} // namespace tt_metal - -} // namespace tt diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp new file mode 100644 index 00000000000..4beb6402a2c --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp @@ -0,0 +1,411 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + + +#include + +#include "ttnn/deprecated/tt_dnn/op_library/cb_utils.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" +#include "ttnn/operation.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/work_split_tilize.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "tt_metal/host_api.hpp" + +using namespace tt::constants; + +namespace ttnn::operations::data_movement::detail { + +operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& output) { + tt::tt_metal::Program program{}; + + CoreRange core({0, 0}, {0, 0}); + + tt::tt_metal::Buffer* src0_buffer = a.buffer(); + + // This should allocate a DRAM buffer on the device + tt::tt_metal::Device* device = a.device(); + auto output_shape = output.get_legacy_shape(); + + tt::tt_metal::Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + + tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + + uint32_t num_tiles = a.volume() / TILE_HW; + + auto width = a.get_legacy_shape()[-1]; + uint32_t stick_s = width; + uint32_t num_sticks = a.volume() / width; + uint32_t stick_size = stick_s * a.element_size(); // Assuming bfloat16 dataformat + + uint32_t num_tiles_in_row = stick_s / TILE_WIDTH; + // Ensure we don't intrude into storage space + uint32_t max_l1_size = a.device()->l1_size_per_core() / 2 - L1_UNRESERVED_BASE; + uint32_t max_tiles = max_l1_size / (input_single_tile_size + output_single_tile_size); // 2 CBs + // Currently need the number of tiles in a row to be divisible by tiles in a block + uint32_t num_tiles_per_block = 1; + if (num_tiles_in_row <= max_tiles) { + num_tiles_per_block = num_tiles_in_row; + } else { + for (uint32_t n_t = max_tiles; n_t > 0; n_t--) { + if (num_tiles_in_row % n_t == 0) { + num_tiles_per_block = n_t; + break; + } + } + } + uint32_t block_width_size = num_tiles_per_block * TILE_WIDTH * a.element_size(); + 
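// Editor's note: standalone sketch of the block-size search just above -- take the whole row of tiles
// if it fits in the L1 budget, otherwise the largest divisor of the row's tile count that does.
// Plain C++, no tt-metal calls; pick_tiles_per_block is an illustrative name, and max_tiles stands in
// for max_l1_size / (input_single_tile_size + output_single_tile_size) computed in the factory.
#include <cstdint>
#include <cassert>

uint32_t pick_tiles_per_block(uint32_t num_tiles_in_row, uint32_t max_tiles) {
    if (num_tiles_in_row <= max_tiles) {
        return num_tiles_in_row;        // whole row fits in the two CBs
    }
    for (uint32_t n_t = max_tiles; n_t > 0; n_t--) {
        if (num_tiles_in_row % n_t == 0) {
            return n_t;                 // largest divisor that still fits
        }
    }
    return 1;                           // unreachable: n_t == 1 always divides
}

int main() {
    assert(pick_tiles_per_block(8, 16) == 8);   // row fits outright
    assert(pick_tiles_per_block(12, 8) == 6);   // 12 % 6 == 0 and 6 <= 8
    assert(pick_tiles_per_block(7, 3) == 1);    // prime row width, fall back to single tiles
    return 0;
}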
uint32_t num_full_blocks_in_row = num_tiles_in_row / num_tiles_per_block; + uint32_t num_leftover_tiles = num_tiles_in_row % num_tiles_per_block; + uint32_t leftover_width_in_row = num_leftover_tiles * a.element_size(); + + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = num_tiles_per_block; + + auto src0_cb_config = tt::tt_metal::CircularBufferConfig( + num_input_tiles * input_single_tile_size, {{src0_cb_index, input_cb_data_format}}) + .set_page_size(src0_cb_index, input_single_tile_size); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, src0_cb_config); + + uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t num_output_tiles = num_tiles_per_block; + auto cb_output_config = tt::tt_metal::CircularBufferConfig( + num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) + .set_page_size(output_cb_index, output_single_tile_size); + auto cb_output = tt::tt_metal::CreateCircularBuffer(program, core, cb_output_config); + + vector reader_kernel_args = { + src0_buffer->address(), + num_sticks, + stick_size, + num_tiles_per_block, + block_width_size, + num_full_blocks_in_row, + num_leftover_tiles, + leftover_width_in_row, + 0 // row_start_id + }; + + // Reader compile-time args + uint32_t src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); + uint32_t log2_stick_size = stick_size_is_power_of_two ? (uint32_t)log2(stick_size) : 0; + std::vector reader_compile_time_args = {src0_is_dram, stick_size_is_power_of_two, log2_stick_size}; + + uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + std::vector writer_compile_time_args = {output_cb_index, out_is_dram}; + + // Tilized reader + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp", + core, + tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); + + // Tilized writer + tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + core, + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); + + vector compute_args = { + num_tiles / num_tiles_per_block, // per_core_block_cnt + num_tiles_per_block // per_core_block_tile_cnt + }; + + auto tilize_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + core, + tt::tt_metal::ComputeConfig{.compile_args = compute_args}); + + tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); + + tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_buffer->address(), num_tiles, 0}); + + auto override_runtime_args_callback = [reader_kernel_id = unary_reader_kernel_id, + writer_kernel_id = unary_writer_kernel_id]( + const Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { + auto src_buffer = input_buffers.at(0); + + auto dst_buffer = output_buffers.at(0); + + CoreCoord core = {0, 0}; + + { + auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_buffer->address(); + } + + { + auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = 
dst_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks tilize_multi_core_interleaved(const Tensor& a, Tensor& output) { + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + tt::DataFormat input_cb_data_format = datatype_to_dataformat_converter(a.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + tt::DataFormat output_cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + + int32_t ntiles = a.volume() / TILE_HW; + uint32_t ntiles_per_block = a.get_legacy_shape()[-1] / TILE_WIDTH; + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); + uint32_t block_size_nbytes = a.get_legacy_shape()[-1] * a.element_size(); + + Device* device = a.device(); + auto grid_size = device->compute_with_storage_grid_size(); + auto [ncores, all_cores, core_range, core_range_cliff, nblocks_per_core, nblocks_per_core_cliff] = + split_blocks_for_tilize(grid_size, nblocks); + + create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, ntiles_per_block, input_cb_data_format); + + auto [output_cb_index, _] = + create_cb(tt::CB::c_out0, program, all_cores, output_single_tile_size, ntiles_per_block, output_cb_data_format); + + Buffer* src0_buffer = a.buffer(); + Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + /** reader + */ + uint32_t src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(block_size_nbytes); + uint32_t log2_stick_size = stick_size_is_power_of_two ? (uint32_t)std::log2(block_size_nbytes) : 0; + std::vector reader_ct_args = {src0_is_dram, stick_size_is_power_of_two, log2_stick_size}; + KernelHandle unary_reader_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp", + all_cores, + ReaderDataMovementConfig(reader_ct_args)); + + /** writer + */ + uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 
1 : 0; + std::vector writer_ct_args = {output_cb_index, out_is_dram}; + KernelHandle unary_writer_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + all_cores, + WriterDataMovementConfig(writer_ct_args)); + + /** compute + */ + vector compute_args = {nblocks_per_core, ntiles_per_block}; + vector compute_args_cliff = {nblocks_per_core_cliff, ntiles_per_block}; + + if (core_range.ranges().size() > 0) { + auto tilize_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + core_range, + ComputeConfig{.compile_args = compute_args}); + } + if (core_range_cliff.size() > 0) { + auto tilize_cliff_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + core_range_cliff, + ComputeConfig{.compile_args = compute_args_cliff}); + } + + // 1D distribution of blocks across cores + bool has_cliff = core_range_cliff.size() > 0; + + uint32_t ncores_full = ncores - has_cliff; + uint32_t ncores_x = grid_size.x; + uint32_t tile_start_id = 0; + uint32_t row_start_id = 0; + const auto& cores = grid_to_cores(ncores, grid_size.x, grid_size.y, true); + for (uint32_t i = 0; i < ncores_full; ++i) { + const CoreCoord& core = cores[i]; + + // reader runtime args + vector reader_rt_args = { + src0_buffer->address(), + nblocks_per_core * TILE_HEIGHT, + block_size_nbytes, + ntiles_per_block, + block_size_nbytes, + 1, // full blocks in row + 0, // num leftover tiles + 0, // leftover width in row + row_start_id}; + + // writer runtime args + vector writer_rt_args = { + dst_buffer->address(), + ntiles_per_block * nblocks_per_core, // ntiles per core + tile_start_id // start id + }; + + SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args); + SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args); + + tile_start_id += ntiles_per_block * nblocks_per_core; + row_start_id += TILE_HEIGHT * nblocks_per_core; + } + if (has_cliff) { + // the last core is a cliff core with nblocks_per_core_cliff blocks + const CoreCoord& core = cores.back(); + + // reader runtime args + vector reader_rt_args = { + src0_buffer->address(), + nblocks_per_core_cliff * TILE_HEIGHT, + block_size_nbytes, + ntiles_per_block, + block_size_nbytes, + 1, // full blocks in row + 0, // num leftover tiles + 0, // leftover width in row + row_start_id}; + + // writer runtime args + vector writer_rt_args = { + dst_buffer->address(), + ntiles_per_block * nblocks_per_core_cliff, // ntiles per core + tile_start_id // start id + }; + + SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args); + SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args); + } + + auto override_runtime_args_callback = + [reader_kernel_id = unary_reader_kernel_id, writer_kernel_id = unary_writer_kernel_id, cores = cores]( + const Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { + auto src_buffer = input_buffers.at(0); + auto dst_buffer = output_buffers.at(0); + + auto& reader_runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id); + auto& writer_runtime_args_by_core = GetRuntimeArgs(program, writer_kernel_id); + for (const auto& core : cores) { + { + auto& runtime_args = reader_runtime_args_by_core[core.x][core.y]; + runtime_args[0] = src_buffer->address(); + } + { + auto& runtime_args = writer_runtime_args_by_core[core.x][core.y]; + runtime_args[0] = dst_buffer->address(); + } + } + }; + + return 
{std::move(program), override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks tilize_multi_core_sharded(const Tensor& input, Tensor& output) { + tt::tt_metal::Program program{}; + + tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + + uint32_t num_tiles = input.volume() / TILE_HW; + + tt::tt_metal::Device* device = input.device(); + + auto shard_spec = input.shard_spec().value(); + uint32_t num_tiles_per_shard = shard_spec.shape[0] * shard_spec.shape[1] / TILE_HW; + uint32_t num_tiles_per_row = shard_spec.shape[1] / TILE_WIDTH; + auto all_cores = shard_spec.grid; + uint32_t num_cores_x = device->compute_with_storage_grid_size().x; + uint32_t num_cores = all_cores.num_cores(); + + auto [src0_cb_index, cb_src0] = create_cb( + tt::CB::c_in0, + program, + all_cores, + input_single_tile_size, + num_tiles_per_shard, + input_cb_data_format, + input.buffer()); + + auto [output_cb_index, cb_output] = create_cb( + tt::CB::c_out0, + program, + all_cores, + output_single_tile_size, + num_tiles_per_shard, + output_cb_data_format, + output.buffer()); + + auto src_buffer = input.buffer(); + + auto dst_buffer = output.buffer(); + + std::vector reader_compile_time_args = {(std::uint32_t)src0_cb_index}; + + bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + std::vector writer_compile_time_args = {(std::uint32_t)output_cb_index}; + + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_sharded.cpp", + all_cores, + tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); + + tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/kernels/dataflow/writer_unary_sharded.cpp", + all_cores, + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); + + vector compute_args = {uint32_t(num_tiles_per_shard / num_tiles_per_row), uint32_t(num_tiles_per_row)}; + + auto untilize_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + all_cores, + tt::tt_metal::ComputeConfig{.compile_args = compute_args}); + + tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, all_cores, {num_tiles_per_shard}); + + tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, all_cores, {num_tiles_per_shard}); + + auto override_runtime_arguments_callback = + [unary_reader_kernel_id, unary_writer_kernel_id, cb_src0, cb_output]( + const void* operation, + Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { + auto src_buffer = input_tensors.at(0).buffer(); + + auto dst_buffer = output_tensors.at(0).buffer(); + + UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer); + + UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); + }; + return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; +} + +operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output) { + if 
(a.memory_config().is_sharded()) { + return tilize_multi_core_sharded(a, output); + } else { + return tilize_multi_core_interleaved(a, output); + } +} + +} // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp index 3388de10a99..1e061f399a0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp @@ -4,409 +4,13 @@ #pragma once -#include - -#include "ttnn/deprecated/tt_dnn/op_library/cb_utils.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" -#include "ttnn/operation.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/work_split_tilize.hpp" -#include "tt_metal/common/constants.hpp" -#include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; namespace ttnn::operations::data_movement::detail { -operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& output) { - tt::tt_metal::Program program{}; - - CoreRange core({0, 0}, {0, 0}); - - tt::tt_metal::Buffer* src0_buffer = a.buffer(); - - // This should allocate a DRAM buffer on the device - tt::tt_metal::Device* device = a.device(); - auto output_shape = output.get_legacy_shape(); - - tt::tt_metal::Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - - tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - - uint32_t num_tiles = a.volume() / TILE_HW; - - auto width = a.get_legacy_shape()[-1]; - uint32_t stick_s = width; - uint32_t num_sticks = a.volume() / width; - uint32_t stick_size = stick_s * a.element_size(); // Assuming bfloat16 dataformat - - uint32_t num_tiles_in_row = stick_s / TILE_WIDTH; - // Ensure we don't intrude into storage space - uint32_t max_l1_size = a.device()->l1_size_per_core() / 2 - L1_UNRESERVED_BASE; - uint32_t max_tiles = max_l1_size / (input_single_tile_size + output_single_tile_size); // 2 CBs - // Currently need the number of tiles in a row to be divisible by tiles in a block - uint32_t num_tiles_per_block = 1; - if (num_tiles_in_row <= max_tiles) { - num_tiles_per_block = num_tiles_in_row; - } else { - for (uint32_t n_t = max_tiles; n_t > 0; n_t--) { - if (num_tiles_in_row % n_t == 0) { - num_tiles_per_block = n_t; - break; - } - } - } - uint32_t block_width_size = num_tiles_per_block * TILE_WIDTH * a.element_size(); - uint32_t num_full_blocks_in_row = num_tiles_in_row / num_tiles_per_block; - uint32_t num_leftover_tiles = num_tiles_in_row % num_tiles_per_block; - uint32_t leftover_width_in_row = num_leftover_tiles * a.element_size(); - - uint32_t src0_cb_index = 0; - uint32_t num_input_tiles = num_tiles_per_block; - - auto src0_cb_config = tt::tt_metal::CircularBufferConfig( - num_input_tiles * input_single_tile_size, {{src0_cb_index, input_cb_data_format}}) - .set_page_size(src0_cb_index, input_single_tile_size); - auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, src0_cb_config); - - uint32_t output_cb_index = 16; // output operands 
start at index 16 - uint32_t num_output_tiles = num_tiles_per_block; - auto cb_output_config = tt::tt_metal::CircularBufferConfig( - num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) - .set_page_size(output_cb_index, output_single_tile_size); - auto cb_output = tt::tt_metal::CreateCircularBuffer(program, core, cb_output_config); - - vector reader_kernel_args = { - src0_buffer->address(), - num_sticks, - stick_size, - num_tiles_per_block, - block_width_size, - num_full_blocks_in_row, - num_leftover_tiles, - leftover_width_in_row, - 0 // row_start_id - }; - - // Reader compile-time args - uint32_t src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); - uint32_t log2_stick_size = stick_size_is_power_of_two ? (uint32_t)log2(stick_size) : 0; - std::vector reader_compile_time_args = {src0_is_dram, stick_size_is_power_of_two, log2_stick_size}; - - uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - std::vector writer_compile_time_args = {output_cb_index, out_is_dram}; - - // Tilized reader - tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp", - core, - tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - - // Tilized writer - tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", - core, - tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - - vector compute_args = { - num_tiles / num_tiles_per_block, // per_core_block_cnt - num_tiles_per_block // per_core_block_tile_cnt - }; - - auto tilize_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - core, - tt::tt_metal::ComputeConfig{.compile_args = compute_args}); - - tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); - - tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_buffer->address(), num_tiles, 0}); - - auto override_runtime_args_callback = [reader_kernel_id = unary_reader_kernel_id, - writer_kernel_id = unary_writer_kernel_id]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { - auto src_buffer = input_buffers.at(0); - - auto dst_buffer = output_buffers.at(0); - - CoreCoord core = {0, 0}; - - { - auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); - runtime_args[0] = src_buffer->address(); - } - - { - auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); - runtime_args[0] = dst_buffer->address(); - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - -operation::ProgramWithCallbacks tilize_multi_core_interleaved(const Tensor& a, Tensor& output) { - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - tt::DataFormat input_cb_data_format = datatype_to_dataformat_converter(a.get_dtype()); - uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - tt::DataFormat output_cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - - int32_t ntiles = 
a.volume() / TILE_HW; - uint32_t ntiles_per_block = a.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); - uint32_t block_size_nbytes = a.get_legacy_shape()[-1] * a.element_size(); - - Device* device = a.device(); - auto grid_size = device->compute_with_storage_grid_size(); - auto [ncores, all_cores, core_range, core_range_cliff, nblocks_per_core, nblocks_per_core_cliff] = - split_blocks_for_tilize(grid_size, nblocks); - - create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, ntiles_per_block, input_cb_data_format); - - auto [output_cb_index, _] = - create_cb(tt::CB::c_out0, program, all_cores, output_single_tile_size, ntiles_per_block, output_cb_data_format); - - Buffer* src0_buffer = a.buffer(); - Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - /** reader - */ - uint32_t src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(block_size_nbytes); - uint32_t log2_stick_size = stick_size_is_power_of_two ? (uint32_t)std::log2(block_size_nbytes) : 0; - std::vector reader_ct_args = {src0_is_dram, stick_size_is_power_of_two, log2_stick_size}; - KernelHandle unary_reader_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp", - all_cores, - ReaderDataMovementConfig(reader_ct_args)); - - /** writer - */ - uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - std::vector writer_ct_args = {output_cb_index, out_is_dram}; - KernelHandle unary_writer_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", - all_cores, - WriterDataMovementConfig(writer_ct_args)); - - /** compute - */ - vector compute_args = {nblocks_per_core, ntiles_per_block}; - vector compute_args_cliff = {nblocks_per_core_cliff, ntiles_per_block}; - - if (core_range.ranges().size() > 0) { - auto tilize_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - core_range, - ComputeConfig{.compile_args = compute_args}); - } - if (core_range_cliff.size() > 0) { - auto tilize_cliff_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - core_range_cliff, - ComputeConfig{.compile_args = compute_args_cliff}); - } - - // 1D distribution of blocks across cores - bool has_cliff = core_range_cliff.size() > 0; - - uint32_t ncores_full = ncores - has_cliff; - uint32_t ncores_x = grid_size.x; - uint32_t tile_start_id = 0; - uint32_t row_start_id = 0; - const auto& cores = grid_to_cores(ncores, grid_size.x, grid_size.y, true); - for (uint32_t i = 0; i < ncores_full; ++i) { - const CoreCoord& core = cores[i]; - - // reader runtime args - vector reader_rt_args = { - src0_buffer->address(), - nblocks_per_core * TILE_HEIGHT, - block_size_nbytes, - ntiles_per_block, - block_size_nbytes, - 1, // full blocks in row - 0, // num leftover tiles - 0, // leftover width in row - row_start_id}; - - // writer runtime args - vector writer_rt_args = { - dst_buffer->address(), - ntiles_per_block * nblocks_per_core, // ntiles per core - tile_start_id // start id - }; - - SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args); - SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args); - - 
tile_start_id += ntiles_per_block * nblocks_per_core; - row_start_id += TILE_HEIGHT * nblocks_per_core; - } - if (has_cliff) { - // the last core is a cliff core with nblocks_per_core_cliff blocks - const CoreCoord& core = cores.back(); - - // reader runtime args - vector reader_rt_args = { - src0_buffer->address(), - nblocks_per_core_cliff * TILE_HEIGHT, - block_size_nbytes, - ntiles_per_block, - block_size_nbytes, - 1, // full blocks in row - 0, // num leftover tiles - 0, // leftover width in row - row_start_id}; - - // writer runtime args - vector writer_rt_args = { - dst_buffer->address(), - ntiles_per_block * nblocks_per_core_cliff, // ntiles per core - tile_start_id // start id - }; - - SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args); - SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args); - } - - auto override_runtime_args_callback = - [reader_kernel_id = unary_reader_kernel_id, writer_kernel_id = unary_writer_kernel_id, cores = cores]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { - auto src_buffer = input_buffers.at(0); - auto dst_buffer = output_buffers.at(0); - - auto& reader_runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id); - auto& writer_runtime_args_by_core = GetRuntimeArgs(program, writer_kernel_id); - for (const auto& core : cores) { - { - auto& runtime_args = reader_runtime_args_by_core[core.x][core.y]; - runtime_args[0] = src_buffer->address(); - } - { - auto& runtime_args = writer_runtime_args_by_core[core.x][core.y]; - runtime_args[0] = dst_buffer->address(); - } - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - -operation::ProgramWithCallbacks tilize_multi_core_sharded(const Tensor& input, Tensor& output) { - tt::tt_metal::Program program{}; - - tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input.get_dtype()); - uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - - uint32_t num_tiles = input.volume() / TILE_HW; - - tt::tt_metal::Device* device = input.device(); - - auto shard_spec = input.shard_spec().value(); - uint32_t num_tiles_per_shard = shard_spec.shape[0] * shard_spec.shape[1] / TILE_HW; - uint32_t num_tiles_per_row = shard_spec.shape[1] / TILE_WIDTH; - auto all_cores = shard_spec.grid; - uint32_t num_cores_x = device->compute_with_storage_grid_size().x; - uint32_t num_cores = all_cores.num_cores(); - - auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in0, - program, - all_cores, - input_single_tile_size, - num_tiles_per_shard, - input_cb_data_format, - input.buffer()); - - auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, - program, - all_cores, - output_single_tile_size, - num_tiles_per_shard, - output_cb_data_format, - output.buffer()); - - auto src_buffer = input.buffer(); - - auto dst_buffer = output.buffer(); - - std::vector reader_compile_time_args = {(std::uint32_t)src0_cb_index}; - - bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 
1 : 0; - std::vector writer_compile_time_args = {(std::uint32_t)output_cb_index}; - - tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_sharded.cpp", - all_cores, - tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - - tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/kernels/dataflow/writer_unary_sharded.cpp", - all_cores, - tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - - vector compute_args = {uint32_t(num_tiles_per_shard / num_tiles_per_row), uint32_t(num_tiles_per_row)}; - - auto untilize_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - all_cores, - tt::tt_metal::ComputeConfig{.compile_args = compute_args}); - - tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, all_cores, {num_tiles_per_shard}); - - tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, all_cores, {num_tiles_per_shard}); - - auto override_runtime_arguments_callback = - [unary_reader_kernel_id, unary_writer_kernel_id, cb_src0, cb_output]( - const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { - auto src_buffer = input_tensors.at(0).buffer(); - - auto dst_buffer = output_tensors.at(0).buffer(); - - UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer); - - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - }; - return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; -} +operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& output); +operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output); -operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output) { - if (a.memory_config().is_sharded()) { - return tilize_multi_core_sharded(a, output); - } else { - return tilize_multi_core_interleaved(a, output); - } -} } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp new file mode 100644 index 00000000000..c4535254a86 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp @@ -0,0 +1,491 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + + +#include + +#include "ttnn/deprecated/tt_dnn/op_library/cb_utils.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" +#include "ttnn/operation.hpp" +#include "ttnn/deprecated/tt_dnn/op_library/work_split_tilize.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "tt_metal/host_api.hpp" + +using namespace tt::constants; + +namespace ttnn::operations::data_movement::detail { + +operation::ProgramWithCallbacks tilize_with_val_padding_single_core( + const Tensor& a, Tensor& output, const float pad_value) { + auto output_shape = output.get_legacy_shape(); + + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + CoreRange core({0, 0}, {0, 0}); + + // This should allocate a DRAM buffer on the device + tt::tt_metal::Device* device = a.device(); + + tt::tt_metal::Buffer* src0_buffer = a.buffer(); + + tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + + tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + + int32_t num_tiles = output.volume() / TILE_HW; + + auto true_input_shape = a.get_legacy_shape(); + auto true_output_shape = output.get_legacy_shape(); + + auto input_w = true_input_shape.rank() >= 4 ? true_input_shape[-4] : 1; + auto input_z = true_input_shape.rank() >= 3 ? true_input_shape[-3] : 1; + auto input_y = true_input_shape.rank() >= 2 ? true_input_shape[-2] : 1; + auto input_x = true_input_shape[-1]; + + auto output_w = true_output_shape.rank() >= 4 ? true_output_shape[-4] : 1; + auto output_z = true_output_shape.rank() >= 3 ? true_output_shape[-3] : 1; + auto output_y = true_output_shape.rank() >= 2 ? 
true_output_shape[-2] : 1; + auto output_x = true_output_shape[-1]; + + uint32_t unpadded_row_size_bytes = input_x * a.element_size(); // Assuming bfloat16 dataformat + uint32_t padded_row_size_bytes = output_x * a.element_size(); // Assuming bfloat16 dataformat + + constexpr uint32_t alignment = 32; + + uint32_t num_tiles_in_row = output_x / TILE_WIDTH; + // Ensure we don't intrude into storage space + uint32_t max_l1_size = a.device()->l1_size_per_core() / 2 - L1_UNRESERVED_BASE; + // Memory usage is 2 CBs of width W, plus buffer of size alignment + (W * datum size) + uint32_t max_X = (max_l1_size - alignment) / (a.element_size() * TILE_HEIGHT * 2 + a.element_size()); + uint32_t max_tiles = max_X / TILE_WIDTH; + + // Currently need the number of tiles in a row to be divisible by tiles in a block + uint32_t num_tiles_per_block = 1; + if (num_tiles_in_row <= max_tiles) { + num_tiles_per_block = num_tiles_in_row; + } else { + for (uint32_t n_t = max_tiles; n_t > 0; n_t--) { + if (num_tiles_in_row % n_t == 0) { + num_tiles_per_block = n_t; + break; + } + } + } + + uint32_t block_width = num_tiles_per_block * TILE_WIDTH; + uint32_t block_row_size = block_width * a.element_size(); + uint32_t num_blocks_w_output = padded_row_size_bytes / block_row_size; + uint32_t num_blocks_w_input = unpadded_row_size_bytes / block_row_size; + + // Leftover size if input is not divisible by block size + uint32_t block_row_leftover_size = unpadded_row_size_bytes - num_blocks_w_input * block_row_size; + + // Number of blocks that differ between input and output + const uint32_t num_blocks_w_diff = num_blocks_w_output - num_blocks_w_input - (block_row_leftover_size > 0 ? 1 : 0); + + const uint32_t padded_Y_diff_blocks = (output_y - input_y) / TILE_HEIGHT * num_blocks_w_output; + const uint32_t padded_Z_diff_blocks = (output_z - input_z) * output_y / TILE_HEIGHT * num_blocks_w_output; + const uint32_t padded_W_diff_blocks = + (output_w - input_w) * output_z * output_y / TILE_HEIGHT * num_blocks_w_output; + const uint32_t num_leftover_Y = input_y - input_y / TILE_HEIGHT * TILE_HEIGHT; + + tt::tt_metal::Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = num_tiles_per_block; + assert(num_input_tiles > 0); + tt::tt_metal::CircularBufferConfig src0_cb_config = + tt::tt_metal::CircularBufferConfig( + num_input_tiles * input_single_tile_size, {{src0_cb_index, input_cb_data_format}}) + .set_page_size(src0_cb_index, input_single_tile_size); + auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, src0_cb_config); + + uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t num_output_tiles = num_tiles_per_block; + tt::tt_metal::CircularBufferConfig cb_output_config = + tt::tt_metal::CircularBufferConfig( + num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) + .set_page_size(output_cb_index, output_single_tile_size); + auto cb_output = tt::tt_metal::CreateCircularBuffer(program, core, cb_output_config); + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); + + vector reader_kernel_args = { + src0_buffer->address(), + input_w, + padded_W_diff_blocks, + input_z, + padded_Z_diff_blocks, + input_y, + padded_Y_diff_blocks, + num_leftover_Y, + input_x, + unpadded_row_size_bytes, + padded_row_size_bytes, + packed_pad_value, + 
num_blocks_w_input, + num_blocks_w_output, + num_blocks_w_diff, + block_row_size, + block_row_leftover_size}; + + // Reader compile-time args + uint32_t src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + uint32_t stick_size = unpadded_row_size_bytes; + uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); + uint32_t log2_stick_size = stick_size_is_power_of_two ? (uint32_t)log2(stick_size) : 0; + std::vector reader_compile_time_args = {src0_is_dram, stick_size_is_power_of_two, log2_stick_size}; + + // Tilized reader + tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/kernels/dataflow/reader_unary_pad_dims_split_rows.cpp", + core, + tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); + + // Tilized writer + uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + core, + tt::tt_metal::WriterDataMovementConfig({output_cb_index, out_is_dram})); + + vector compute_kernel_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)}; + + auto tilize_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + core, + tt::tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); + + tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); + + tt::tt_metal::SetRuntimeArgs( + program, unary_writer_kernel_id, core, {dst_buffer->address(), (uint32_t)num_tiles, 0}); + + auto override_runtime_args_callback = [reader_kernel_id = unary_reader_kernel_id, + writer_kernel_id = unary_writer_kernel_id]( + const Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { + auto src_buffer = input_buffers.at(0); + + auto dst_buffer = output_buffers.at(0); + + CoreCoord core = {0, 0}; + + { + auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_buffer->address(); + } + + { + auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = dst_buffer->address(); + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_interleaved( + const Tensor& a, Tensor& output, const float pad_value) { + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + + Device* device = a.device(); + CoreCoord grid_size = device->compute_with_storage_grid_size(); + + uint32_t num_blocks = output.volume() / output.get_legacy_shape()[-1] / TILE_HEIGHT; + uint32_t num_tiles_per_row = output.get_legacy_shape()[-1] / TILE_WIDTH; + + auto [ncores, all_cores, core_range, core_range_cliff, nblocks_per_core, nblocks_per_core_cliff] = + split_blocks_for_tilize(grid_size, num_blocks); + + bool has_cliff = core_range_cliff.size() > 0; 
+ + uint32_t unpadded_row_size_bytes = a.get_legacy_shape()[-1] * a.element_size(); // Assuming bfloat16 dataformat + uint32_t padded_row_size_bytes = output.get_legacy_shape()[-1] * a.element_size(); // Assuming bfloat16 dataformat + + auto [src0_cb_index, cb_src0] = + create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, num_tiles_per_row, input_cb_data_format); + + auto [output_cb_index, cb_output] = create_cb( + tt::CB::c_out0, program, all_cores, output_single_tile_size, num_tiles_per_row, output_cb_data_format); + + Buffer* src0_buffer = a.buffer(); + Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + /** reader + */ + uint32_t src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + uint32_t stick_size = unpadded_row_size_bytes; + uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); + uint32_t log2_stick_size = stick_size_is_power_of_two ? (std::uint32_t)std::log2(stick_size) : 0; + + KernelHandle unary_reader_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/kernels/dataflow/reader_unary_pad_dims_split_rows_multicore.cpp", + all_cores, + ReaderDataMovementConfig({src0_is_dram, stick_size_is_power_of_two, log2_stick_size})); + + /** writer + */ + uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; + + KernelHandle unary_writer_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + all_cores, + WriterDataMovementConfig({output_cb_index, out_is_dram})); + + /** compute + */ + if (core_range.size() > 0) { + auto tilize_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + core_range, + ComputeConfig{.compile_args = {nblocks_per_core, num_tiles_per_row}}); + } + if (has_cliff) { + auto tilize_cliff_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", + core_range_cliff, + ComputeConfig{.compile_args = {nblocks_per_core_cliff, num_tiles_per_row}}); + } + + /* RUNTIME ARGS */ + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); + + // 1D distribution of blocks across cores + auto core_assignments = distribute_work( + output.get_legacy_shape().without_padding(), + output.get_legacy_shape().padding(), + ncores, + nblocks_per_core, + has_cliff, + nblocks_per_core_cliff); + + uint32_t tile_start_id = 0; + uint32_t row_start_id = 0; + uint32_t ncores_x = grid_size.x; + + const auto& cores = grid_to_cores(ncores, grid_size.x, grid_size.y, true); + for (uint32_t i = 0; i < ncores; ++i) { + const auto& core = cores[i]; + const std::vector& assignment = core_assignments.at(i); + + // reader runtime args + vector reader_rt_args = { + src0_buffer->address(), + unpadded_row_size_bytes, + padded_row_size_bytes, + packed_pad_value, + row_start_id, + static_cast(assignment.size()), + }; + + uint32_t nblocks_per_core = 0; + + for (const auto& el : assignment) { + nblocks_per_core += el.block_count(); + row_start_id += el.data_row_count(); + reader_rt_args.push_back(el.n_data); + reader_rt_args.push_back(el.n_mixed); + reader_rt_args.push_back(el.n_pads); + reader_rt_args.push_back(el.times); + } + + uint32_t num_tiles_per_core = num_tiles_per_row * nblocks_per_core; + + // writer runtime args + 
vector writer_rt_args = {dst_buffer->address(), num_tiles_per_core, tile_start_id}; + + SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args); + SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args); + + tile_start_id += num_tiles_per_core; + } + + auto override_runtime_args_callback = + [reader_kernel_id = unary_reader_kernel_id, writer_kernel_id = unary_writer_kernel_id, cores = cores]( + const Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { + auto src_buffer = input_buffers.at(0); + auto dst_buffer = output_buffers.at(0); + + auto& reader_runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id); + auto& writer_runtime_args_by_core = GetRuntimeArgs(program, writer_kernel_id); + for (const auto& core : cores) { + { + auto& runtime_args = reader_runtime_args_by_core[core.x][core.y]; + runtime_args[0] = src_buffer->address(); + } + { + auto& runtime_args = writer_runtime_args_by_core[core.x][core.y]; + runtime_args[0] = dst_buffer->address(); + } + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + +// This purely supports input width shard -> output width shard for now +operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded( + const Tensor& a, Tensor& output, const float pad_value) { + tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); + + bool src_sharded = a.memory_config().is_sharded(); + bool out_sharded = output.memory_config().is_sharded(); + + tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); + tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); + + Device* device = a.device(); + + auto input_shard_spec = a.shard_spec().value(); + auto output_shard_spec = output.shard_spec().value(); + + auto all_cores = output_shard_spec.grid; + + uint32_t num_batches = output.volume() / (output.get_legacy_shape()[-2] * output.get_legacy_shape()[-1]); + + uint32_t num_input_rows = input_shard_spec.shape[0]; + uint32_t input_shard_width_bytes = input_shard_spec.shape[1] * a.element_size(); + uint32_t ntiles_per_core = output_shard_spec.shape[0] * output_shard_spec.shape[1] / TILE_HW; + uint32_t ntiles_per_batch = ntiles_per_core / num_batches; + uint32_t ntiles_per_block = output_shard_spec.shape[1] / TILE_WIDTH; + uint32_t nblocks_per_core = output_shard_spec.shape[0] / TILE_HEIGHT; + uint32_t num_padded_rows = output.get_legacy_shape()[-2] - a.get_legacy_shape()[-2]; + + auto [src0_cb_index, cb_src0] = create_cb( + tt::CB::c_in1, + program, + all_cores, + input_shard_width_bytes, + num_input_rows, + input_cb_data_format, + src_sharded ? a.buffer() : nullptr); + + auto [src1_cb_index, cb_src1] = create_cb( + tt::CB::c_in0, program, all_cores, input_single_tile_size, ntiles_per_batch * 2, input_cb_data_format); + + auto [src2_cb_index, cb_src2] = + create_cb(tt::CB::c_in2, program, all_cores, input_shard_width_bytes, 1, input_cb_data_format); + + auto [output_cb_index, cb_output] = create_cb( + tt::CB::c_out0, + program, + all_cores, + output_single_tile_size, + ntiles_per_core, + output_cb_data_format, + out_sharded ? 
output.buffer() : nullptr); + + Buffer* src0_buffer = a.buffer(); + Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + /** reader + */ + KernelHandle unary_reader_kernel_id; + std::vector reader_ct_args = { + (std::uint32_t)src0_cb_index, + (std::uint32_t)src1_cb_index, + (std::uint32_t)src2_cb_index, + }; + + unary_reader_kernel_id = tt::tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/kernels/dataflow/reader_unary_pad_height_width_sharded.cpp", + all_cores, + tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); + + /** writer + */ + KernelHandle unary_writer_kernel_id; + bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; + vector writer_ct_args = { + output_cb_index, + }; + unary_writer_kernel_id = CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/kernels/dataflow/writer_unary_sharded.cpp", + all_cores, + WriterDataMovementConfig(writer_ct_args)); + + /** compute + */ + vector compute_args = { + (uint32_t)nblocks_per_core, // per_core_block_cnt + (uint32_t)ntiles_per_block, // per_block_ntiles + }; + + auto tilize_kernel_id = CreateKernel( + program, "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", all_cores, ComputeConfig{.compile_args = compute_args}); + + bfloat16 bfloat_pad_value = bfloat16(pad_value); + uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); + + vector reader_rt_args = { + num_input_rows, + input_shard_width_bytes, + (num_input_rows / num_batches) * input_shard_width_bytes, + ntiles_per_batch, + num_padded_rows, + num_batches, + packed_pad_value}; + tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, all_cores, reader_rt_args); + + vector writer_rt_args = {ntiles_per_core}; + tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, all_cores, writer_rt_args); + + auto override_runtime_arguments_callback = [reader_kernel_id = unary_reader_kernel_id, + writer_kernel_id = unary_writer_kernel_id, + cb_src0 = cb_src0, + cb_output = cb_output]( + const void* operation, + Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors) { + auto src_buffer = input_tensors.at(0).buffer(); + auto dst_buffer = output_tensors.at(0).buffer(); + + UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer); + UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); + }; + + return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; +} + +operation::ProgramWithCallbacks tilize_with_val_padding_multi_core( + const Tensor& a, Tensor& output, const float pad_value) { + if (a.memory_config().is_sharded()) { + return tilize_with_val_padding_multi_core_sharded(a, output, pad_value); + } else { + return tilize_with_val_padding_multi_core_interleaved(a, output, pad_value); + } +} + +} // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp index 7db439aa9df..be4cb3beccf 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp @@ -4,14 +4,6 @@ #pragma once -#include - -#include "ttnn/deprecated/tt_dnn/op_library/cb_utils.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/math.hpp" -#include "ttnn/operation.hpp" -#include "ttnn/deprecated/tt_dnn/op_library/work_split_tilize.hpp" -#include "tt_metal/common/constants.hpp" -#include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" using namespace tt::constants; @@ -19,474 +11,11 @@ using namespace tt::constants; namespace ttnn::operations::data_movement::detail { operation::ProgramWithCallbacks tilize_with_val_padding_single_core( - const Tensor& a, Tensor& output, const float pad_value) { - auto output_shape = output.get_legacy_shape(); - - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - CoreRange core({0, 0}, {0, 0}); - - // This should allocate a DRAM buffer on the device - tt::tt_metal::Device* device = a.device(); - - tt::tt_metal::Buffer* src0_buffer = a.buffer(); - - tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - - tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - - int32_t num_tiles = output.volume() / TILE_HW; - - auto true_input_shape = a.get_legacy_shape(); - auto true_output_shape = output.get_legacy_shape(); - - auto input_w = true_input_shape.rank() >= 4 ? true_input_shape[-4] : 1; - auto input_z = true_input_shape.rank() >= 3 ? true_input_shape[-3] : 1; - auto input_y = true_input_shape.rank() >= 2 ? true_input_shape[-2] : 1; - auto input_x = true_input_shape[-1]; - - auto output_w = true_output_shape.rank() >= 4 ? true_output_shape[-4] : 1; - auto output_z = true_output_shape.rank() >= 3 ? true_output_shape[-3] : 1; - auto output_y = true_output_shape.rank() >= 2 ? 
true_output_shape[-2] : 1; - auto output_x = true_output_shape[-1]; - - uint32_t unpadded_row_size_bytes = input_x * a.element_size(); // Assuming bfloat16 dataformat - uint32_t padded_row_size_bytes = output_x * a.element_size(); // Assuming bfloat16 dataformat - - constexpr uint32_t alignment = 32; - - uint32_t num_tiles_in_row = output_x / TILE_WIDTH; - // Ensure we don't intrude into storage space - uint32_t max_l1_size = a.device()->l1_size_per_core() / 2 - L1_UNRESERVED_BASE; - // Memory usage is 2 CBs of width W, plus buffer of size alignment + (W * datum size) - uint32_t max_X = (max_l1_size - alignment) / (a.element_size() * TILE_HEIGHT * 2 + a.element_size()); - uint32_t max_tiles = max_X / TILE_WIDTH; - - // Currently need the number of tiles in a row to be divisible by tiles in a block - uint32_t num_tiles_per_block = 1; - if (num_tiles_in_row <= max_tiles) { - num_tiles_per_block = num_tiles_in_row; - } else { - for (uint32_t n_t = max_tiles; n_t > 0; n_t--) { - if (num_tiles_in_row % n_t == 0) { - num_tiles_per_block = n_t; - break; - } - } - } - - uint32_t block_width = num_tiles_per_block * TILE_WIDTH; - uint32_t block_row_size = block_width * a.element_size(); - uint32_t num_blocks_w_output = padded_row_size_bytes / block_row_size; - uint32_t num_blocks_w_input = unpadded_row_size_bytes / block_row_size; - - // Leftover size if input is not divisible by block size - uint32_t block_row_leftover_size = unpadded_row_size_bytes - num_blocks_w_input * block_row_size; - - // Number of blocks that differ between input and output - const uint32_t num_blocks_w_diff = num_blocks_w_output - num_blocks_w_input - (block_row_leftover_size > 0 ? 1 : 0); - - const uint32_t padded_Y_diff_blocks = (output_y - input_y) / TILE_HEIGHT * num_blocks_w_output; - const uint32_t padded_Z_diff_blocks = (output_z - input_z) * output_y / TILE_HEIGHT * num_blocks_w_output; - const uint32_t padded_W_diff_blocks = - (output_w - input_w) * output_z * output_y / TILE_HEIGHT * num_blocks_w_output; - const uint32_t num_leftover_Y = input_y - input_y / TILE_HEIGHT * TILE_HEIGHT; - - tt::tt_metal::Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - uint32_t src0_cb_index = 0; - uint32_t num_input_tiles = num_tiles_per_block; - assert(num_input_tiles > 0); - tt::tt_metal::CircularBufferConfig src0_cb_config = - tt::tt_metal::CircularBufferConfig( - num_input_tiles * input_single_tile_size, {{src0_cb_index, input_cb_data_format}}) - .set_page_size(src0_cb_index, input_single_tile_size); - auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, src0_cb_config); - - uint32_t output_cb_index = 16; // output operands start at index 16 - uint32_t num_output_tiles = num_tiles_per_block; - tt::tt_metal::CircularBufferConfig cb_output_config = - tt::tt_metal::CircularBufferConfig( - num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) - .set_page_size(output_cb_index, output_single_tile_size); - auto cb_output = tt::tt_metal::CreateCircularBuffer(program, core, cb_output_config); - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); - - vector reader_kernel_args = { - src0_buffer->address(), - input_w, - padded_W_diff_blocks, - input_z, - padded_Z_diff_blocks, - input_y, - padded_Y_diff_blocks, - num_leftover_Y, - input_x, - unpadded_row_size_bytes, - padded_row_size_bytes, - packed_pad_value, - 
num_blocks_w_input, - num_blocks_w_output, - num_blocks_w_diff, - block_row_size, - block_row_leftover_size}; - - // Reader compile-time args - uint32_t src0_is_dram = src0_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - uint32_t stick_size = unpadded_row_size_bytes; - uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); - uint32_t log2_stick_size = stick_size_is_power_of_two ? (uint32_t)log2(stick_size) : 0; - std::vector reader_compile_time_args = {src0_is_dram, stick_size_is_power_of_two, log2_stick_size}; - - // Tilized reader - tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/kernels/dataflow/reader_unary_pad_dims_split_rows.cpp", - core, - tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - - // Tilized writer - uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", - core, - tt::tt_metal::WriterDataMovementConfig({output_cb_index, out_is_dram})); - - vector compute_kernel_args = {uint32_t(num_tiles / num_tiles_per_block), uint32_t(num_tiles_per_block)}; - - auto tilize_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - core, - tt::tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); - - tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); - - tt::tt_metal::SetRuntimeArgs( - program, unary_writer_kernel_id, core, {dst_buffer->address(), (uint32_t)num_tiles, 0}); - - auto override_runtime_args_callback = [reader_kernel_id = unary_reader_kernel_id, - writer_kernel_id = unary_writer_kernel_id]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { - auto src_buffer = input_buffers.at(0); - - auto dst_buffer = output_buffers.at(0); - - CoreCoord core = {0, 0}; - - { - auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); - runtime_args[0] = src_buffer->address(); - } - - { - auto& runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); - runtime_args[0] = dst_buffer->address(); - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - -operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_interleaved( - const Tensor& a, Tensor& output, const float pad_value) { - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - - Device* device = a.device(); - CoreCoord grid_size = device->compute_with_storage_grid_size(); - - uint32_t num_blocks = output.volume() / output.get_legacy_shape()[-1] / TILE_HEIGHT; - uint32_t num_tiles_per_row = output.get_legacy_shape()[-1] / TILE_WIDTH; - - auto [ncores, all_cores, core_range, core_range_cliff, nblocks_per_core, nblocks_per_core_cliff] = - split_blocks_for_tilize(grid_size, num_blocks); - - bool has_cliff = core_range_cliff.size() > 0; 
- - uint32_t unpadded_row_size_bytes = a.get_legacy_shape()[-1] * a.element_size(); // Assuming bfloat16 dataformat - uint32_t padded_row_size_bytes = output.get_legacy_shape()[-1] * a.element_size(); // Assuming bfloat16 dataformat - - auto [src0_cb_index, cb_src0] = - create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, num_tiles_per_row, input_cb_data_format); - - auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, program, all_cores, output_single_tile_size, num_tiles_per_row, output_cb_data_format); - - Buffer* src0_buffer = a.buffer(); - Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - /** reader - */ - uint32_t src0_is_dram = src0_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - uint32_t stick_size = unpadded_row_size_bytes; - uint32_t stick_size_is_power_of_two = is_power_of_two_at_least_32(stick_size); - uint32_t log2_stick_size = stick_size_is_power_of_two ? (std::uint32_t)std::log2(stick_size) : 0; - - KernelHandle unary_reader_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/kernels/dataflow/reader_unary_pad_dims_split_rows_multicore.cpp", - all_cores, - ReaderDataMovementConfig({src0_is_dram, stick_size_is_power_of_two, log2_stick_size})); - - /** writer - */ - uint32_t out_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - - KernelHandle unary_writer_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", - all_cores, - WriterDataMovementConfig({output_cb_index, out_is_dram})); - - /** compute - */ - if (core_range.size() > 0) { - auto tilize_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - core_range, - ComputeConfig{.compile_args = {nblocks_per_core, num_tiles_per_row}}); - } - if (has_cliff) { - auto tilize_cliff_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", - core_range_cliff, - ComputeConfig{.compile_args = {nblocks_per_core_cliff, num_tiles_per_row}}); - } - - /* RUNTIME ARGS */ - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); - - // 1D distribution of blocks across cores - auto core_assignments = distribute_work( - output.get_legacy_shape().without_padding(), - output.get_legacy_shape().padding(), - ncores, - nblocks_per_core, - has_cliff, - nblocks_per_core_cliff); - - uint32_t tile_start_id = 0; - uint32_t row_start_id = 0; - uint32_t ncores_x = grid_size.x; - - const auto& cores = grid_to_cores(ncores, grid_size.x, grid_size.y, true); - for (uint32_t i = 0; i < ncores; ++i) { - const auto& core = cores[i]; - const std::vector& assignment = core_assignments.at(i); - - // reader runtime args - vector reader_rt_args = { - src0_buffer->address(), - unpadded_row_size_bytes, - padded_row_size_bytes, - packed_pad_value, - row_start_id, - static_cast(assignment.size()), - }; - - uint32_t nblocks_per_core = 0; - - for (const auto& el : assignment) { - nblocks_per_core += el.block_count(); - row_start_id += el.data_row_count(); - reader_rt_args.push_back(el.n_data); - reader_rt_args.push_back(el.n_mixed); - reader_rt_args.push_back(el.n_pads); - reader_rt_args.push_back(el.times); - } - - uint32_t num_tiles_per_core = num_tiles_per_row * nblocks_per_core; - - // writer runtime args - 
vector writer_rt_args = {dst_buffer->address(), num_tiles_per_core, tile_start_id}; - - SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_rt_args); - SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_rt_args); - - tile_start_id += num_tiles_per_core; - } - - auto override_runtime_args_callback = - [reader_kernel_id = unary_reader_kernel_id, writer_kernel_id = unary_writer_kernel_id, cores = cores]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { - auto src_buffer = input_buffers.at(0); - auto dst_buffer = output_buffers.at(0); - - auto& reader_runtime_args_by_core = GetRuntimeArgs(program, reader_kernel_id); - auto& writer_runtime_args_by_core = GetRuntimeArgs(program, writer_kernel_id); - for (const auto& core : cores) { - { - auto& runtime_args = reader_runtime_args_by_core[core.x][core.y]; - runtime_args[0] = src_buffer->address(); - } - { - auto& runtime_args = writer_runtime_args_by_core[core.x][core.y]; - runtime_args[0] = dst_buffer->address(); - } - } - }; - - return {std::move(program), override_runtime_args_callback}; -} - -// This purely supports input width shard -> output width shard for now -operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded( - const Tensor& a, Tensor& output, const float pad_value) { - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - bool src_sharded = a.memory_config().is_sharded(); - bool out_sharded = output.memory_config().is_sharded(); - - tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format); - tt::DataFormat output_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(output.get_dtype()); - uint32_t output_single_tile_size = tt::tt_metal::detail::TileSize(output_cb_data_format); - - Device* device = a.device(); - - auto input_shard_spec = a.shard_spec().value(); - auto output_shard_spec = output.shard_spec().value(); - - auto all_cores = output_shard_spec.grid; - - uint32_t num_batches = output.volume() / (output.get_legacy_shape()[-2] * output.get_legacy_shape()[-1]); - - uint32_t num_input_rows = input_shard_spec.shape[0]; - uint32_t input_shard_width_bytes = input_shard_spec.shape[1] * a.element_size(); - uint32_t ntiles_per_core = output_shard_spec.shape[0] * output_shard_spec.shape[1] / TILE_HW; - uint32_t ntiles_per_batch = ntiles_per_core / num_batches; - uint32_t ntiles_per_block = output_shard_spec.shape[1] / TILE_WIDTH; - uint32_t nblocks_per_core = output_shard_spec.shape[0] / TILE_HEIGHT; - uint32_t num_padded_rows = output.get_legacy_shape()[-2] - a.get_legacy_shape()[-2]; - - auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in1, - program, - all_cores, - input_shard_width_bytes, - num_input_rows, - input_cb_data_format, - src_sharded ? a.buffer() : nullptr); - - auto [src1_cb_index, cb_src1] = create_cb( - tt::CB::c_in0, program, all_cores, input_single_tile_size, ntiles_per_batch * 2, input_cb_data_format); - - auto [src2_cb_index, cb_src2] = - create_cb(tt::CB::c_in2, program, all_cores, input_shard_width_bytes, 1, input_cb_data_format); - - auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, - program, - all_cores, - output_single_tile_size, - ntiles_per_core, - output_cb_data_format, - out_sharded ? 
output.buffer() : nullptr); - - Buffer* src0_buffer = a.buffer(); - Buffer* dst_buffer = output.buffer(); - TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - - /** reader - */ - KernelHandle unary_reader_kernel_id; - std::vector reader_ct_args = { - (std::uint32_t)src0_cb_index, - (std::uint32_t)src1_cb_index, - (std::uint32_t)src2_cb_index, - }; - - unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/kernels/dataflow/reader_unary_pad_height_width_sharded.cpp", - all_cores, - tt::tt_metal::ReaderDataMovementConfig(reader_ct_args)); - - /** writer - */ - KernelHandle unary_writer_kernel_id; - bool out_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0; - vector writer_ct_args = { - output_cb_index, - }; - unary_writer_kernel_id = CreateKernel( - program, - "ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/kernels/dataflow/writer_unary_sharded.cpp", - all_cores, - WriterDataMovementConfig(writer_ct_args)); - - /** compute - */ - vector compute_args = { - (uint32_t)nblocks_per_core, // per_core_block_cnt - (uint32_t)ntiles_per_block, // per_block_ntiles - }; - - auto tilize_kernel_id = CreateKernel( - program, "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/tilize.cpp", all_cores, ComputeConfig{.compile_args = compute_args}); - - bfloat16 bfloat_pad_value = bfloat16(pad_value); - uint32_t packed_pad_value = pack_two_bfloat16_into_uint32({bfloat_pad_value, bfloat_pad_value}); - - vector reader_rt_args = { - num_input_rows, - input_shard_width_bytes, - (num_input_rows / num_batches) * input_shard_width_bytes, - ntiles_per_batch, - num_padded_rows, - num_batches, - packed_pad_value}; - tt::tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, all_cores, reader_rt_args); - - vector writer_rt_args = {ntiles_per_core}; - tt::tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, all_cores, writer_rt_args); - - auto override_runtime_arguments_callback = [reader_kernel_id = unary_reader_kernel_id, - writer_kernel_id = unary_writer_kernel_id, - cb_src0 = cb_src0, - cb_output = cb_output]( - const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>&, - const std::vector& output_tensors) { - auto src_buffer = input_tensors.at(0).buffer(); - auto dst_buffer = output_tensors.at(0).buffer(); + const Tensor& a, Tensor& output, const float pad_value); - UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer); - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - }; - return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; -} operation::ProgramWithCallbacks tilize_with_val_padding_multi_core( - const Tensor& a, Tensor& output, const float pad_value) { - if (a.memory_config().is_sharded()) { - return tilize_with_val_padding_multi_core_sharded(a, output, pad_value); - } else { - return tilize_with_val_padding_multi_core_interleaved(a, output, pad_value); - } -} + const Tensor& a, Tensor& output, const float pad_value); } // namespace ttnn::operations::data_movement::detail
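
Editor's note on the recurring pattern above: each program-factory header now keeps only forward declarations (tilize_program_factory.hpp and tilize_with_val_padding_program_factory.hpp shrink to a handful of lines), while the kernel-building bodies move into matching new .cpp translation units. A condensed sketch of the resulting tilize layout, for orientation only; it is not the verbatim file contents, and the include path spelling is an assumption:

    // tilize_program_factory.hpp -- declarations only
    #pragma once
    #include "tt_metal/host_api.hpp"
    // note: operation::ProgramWithCallbacks is assumed to arrive transitively via the including op headers

    namespace ttnn::operations::data_movement::detail {
    operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& output);
    operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output);
    }  // namespace ttnn::operations::data_movement::detail

    // tilize_program_factory.cpp -- definitions, built as their own translation unit
    #include "ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp"  // assumed path

    namespace ttnn::operations::data_movement::detail {
    operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output) {
        // Dispatch exactly as in the body moved out of the header:
        // sharded inputs take the sharded path, everything else the interleaved one.
        return a.memory_config().is_sharded() ? tilize_multi_core_sharded(a, output)
                                              : tilize_multi_core_interleaved(a, output);
    }
    }  // namespace ttnn::operations::data_movement::detail

Callers such as tilize_op.cpp see the same signatures as before; only the definition site moves.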
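
The sharded factories above refresh cached programs through the tensor-based override_runtime_arguments_callback rather than the buffer-based override_runtime_args_callback, because their circular buffers are backed directly by the shard buffers. Below is the shape of that callback as used in tilize_multi_core_sharded and tilize_with_val_padding_multi_core_sharded, with template arguments written out; the exact parameter spellings are a best-effort reconstruction of what the flattened listing elides:

    auto override_runtime_arguments_callback =
        [unary_reader_kernel_id, unary_writer_kernel_id, cb_src0, cb_output](
            const void* operation,
            Program& program,
            const std::vector<Tensor>& input_tensors,
            const std::vector<std::optional<const Tensor>>& optional_input_tensors,
            const std::vector<Tensor>& output_tensors) {
            // Only the dynamic CB base addresses change between cached runs;
            // kernels, core grids, and compile-time args are reused as-is.
            UpdateDynamicCircularBufferAddress(program, cb_src0, *input_tensors.at(0).buffer());
            UpdateDynamicCircularBufferAddress(program, cb_output, *output_tensors.at(0).buffer());
        };
    return {.program = std::move(program),
            .override_runtime_arguments_callback = override_runtime_arguments_callback};

The interleaved variants keep the older buffer-based callback, which on re-run rewrites only runtime-arg slot 0 (the DRAM base address) of the reader and writer kernels on each core.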