diff --git a/tt_metal/programming_examples/CMakeLists.txt b/tt_metal/programming_examples/CMakeLists.txt
index 6261c05e675..b01c7e8c915 100644
--- a/tt_metal/programming_examples/CMakeLists.txt
+++ b/tt_metal/programming_examples/CMakeLists.txt
@@ -12,6 +12,8 @@ set(PROGRAMMING_EXAMPLES_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/matmul_multicore_reuse/matmul_multicore_reuse.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/matmul_single_core/matmul_single_core.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/pad/pad_multi_core.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/sharding/shard_data_rm.cpp
 )
 
 include(${PROJECT_SOURCE_DIR}/cmake/helper_functions.cmake)
diff --git a/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp b/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp
new file mode 100644
index 00000000000..79a591d4dba
--- /dev/null
+++ b/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+#include "dataflow_api.h"
+
+void kernel_main() {
+
+    const uint32_t src_addr = get_arg_val<uint32_t>(0);
+    const uint32_t pad_addr = get_arg_val<uint32_t>(1);
+    const uint32_t start_src_stick_id = get_arg_val<uint32_t>(2);
+    const uint32_t row_size_diff = get_arg_val<uint32_t>(3);
+    const uint32_t dst_N = get_arg_val<uint32_t>(4);
+    const uint32_t data_size_bytes = get_arg_val<uint32_t>(5);
+    const uint32_t num_rows_per_core = get_arg_val<uint32_t>(6);
+
+    constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1;
+    constexpr bool pad_is_dram = get_compile_time_arg_val(1) == 1;
+    constexpr uint32_t cb_id = tt::CB::c_in0;
+
+    const InterleavedAddrGen<src_is_dram> s0 = {
+        .bank_base_address = src_addr,
+        .page_size = data_size_bytes
+    };
+    const InterleavedAddrGen<pad_is_dram> s1 = {
+        .bank_base_address = pad_addr,
+        .page_size = data_size_bytes
+    };
+
+    // pad based on page
+    uint32_t src_stick_id = start_src_stick_id;
+    uint32_t src_start_col_idx = row_size_diff / 2;
+    uint32_t src_end_col_idx = dst_N - src_start_col_idx;
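+    // Worked example (using the runtime args set by pad_multi_core.cpp below):
+    // row_size_diff = 2 and dst_N = 4 packed sticks per destination row, so
+    // src_start_col_idx = 1 and src_end_col_idx = 3. Each destination row is
+    // assembled as pad | src | src | pad, centering the source row.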
+    for (uint32_t i = 0; i < num_rows_per_core; i++) {
+        for (uint32_t dst_col_idx = 0; dst_col_idx < dst_N; dst_col_idx++) {
+            cb_reserve_back(cb_id, 1);
+            uint32_t l1_addr = get_write_ptr(cb_id);
+            if (dst_col_idx < src_start_col_idx || dst_col_idx >= src_end_col_idx) {
+                // add pad value to cb
+                uint64_t pad_noc_addr = get_noc_addr(0, s1);
+                noc_async_read(pad_noc_addr, l1_addr, data_size_bytes);
+            }
+            else {
+                // add original src data to cb
+                uint64_t src_noc_addr = get_noc_addr(src_stick_id, s0);
+                noc_async_read(src_noc_addr, l1_addr, data_size_bytes);
+                src_stick_id++;
+            }
+            noc_async_read_barrier();
+            cb_push_back(cb_id, 1);
+        }
+    }
+}
diff --git a/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp b/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp
new file mode 100644
index 00000000000..5f5a5e080c3
--- /dev/null
+++ b/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp
@@ -0,0 +1,38 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+#include "dataflow_api.h"
+
+void kernel_main() {
+
+    const uint32_t dst_addr = get_arg_val<uint32_t>(0);
+    const uint32_t start_dst_stick_id = get_arg_val<uint32_t>(1);
+    const uint32_t dst_N = get_arg_val<uint32_t>(2);
+    const uint32_t data_size_bytes = get_arg_val<uint32_t>(3);
+    const uint32_t num_rows_per_core = get_arg_val<uint32_t>(4);
+
+    constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1;
+    constexpr uint32_t cb_id = tt::CB::c_in0;
+
+    const InterleavedAddrGen<dst_is_dram> s0 = {
+        .bank_base_address = dst_addr,
+        .page_size = data_size_bytes
+    };
+
+    uint32_t dst_stick_id = start_dst_stick_id;
+    for (uint32_t row_idx = 0; row_idx < num_rows_per_core; row_idx++) {
+        for (uint32_t dst_col_idx = 0; dst_col_idx < dst_N; dst_col_idx++) {
+            cb_wait_front(cb_id, 1);
+            uint32_t l1_addr = get_read_ptr(cb_id);
+            uint64_t dst_noc_addr = get_noc_addr(dst_stick_id, s0);
+            noc_async_write(l1_addr, dst_noc_addr, data_size_bytes);
+            noc_async_write_barrier();
+            dst_stick_id++;
+            cb_pop_front(cb_id, 1);
+        }
+    }
+}
diff --git a/tt_metal/programming_examples/pad/pad_multi_core.cpp b/tt_metal/programming_examples/pad/pad_multi_core.cpp
new file mode 100644
index 00000000000..74aaf317141
--- /dev/null
+++ b/tt_metal/programming_examples/pad/pad_multi_core.cpp
@@ -0,0 +1,178 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/common/constants.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/common/bfloat16.hpp"
+#include "tt_metal/impl/dispatch/command_queue.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tt_metal/impl/device/device.hpp"
+
+using namespace tt;
+using namespace tt::tt_metal;
+
+int main(int argc, char **argv) {
+    // get program/device
+    int device_id = 0;
+    Device *device = CreateDevice(device_id);
+    CommandQueue& cq = device->command_queue();
+    Program program = CreateProgram();
+
+    // initialize source data
+    constexpr uint32_t src_M = 8;
+    constexpr uint32_t src_N = 4;
+    constexpr uint32_t packed_data_size = sizeof(uint32_t);
+    constexpr uint32_t unpacked_data_size = sizeof(bfloat16);
+    constexpr uint32_t packing_ratio = packed_data_size / unpacked_data_size;
+    uint32_t src_num_values_unpacked = src_M * src_N;
+    uint32_t src_num_values_packed = src_num_values_unpacked / packing_ratio;
+    std::vector<uint32_t> src_vec(src_num_values_packed, 0);
+    // source vector = {1, 2, 3, ... , 30, 31, 32}
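+    // Each uint32_t page packs two bfloat16 values, with the first value in
+    // the low 16 bits. For example, i = 0 packs bfloat16(1) and bfloat16(2),
+    // whose bit patterns are 0x3F80 and 0x4000, into the page 0x40003F80.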
+    for (uint32_t i = 0; i < src_vec.size(); i++) {
+        bfloat16 bfloat_val1 = bfloat16(2 * i + 1);
+        bfloat16 bfloat_val2 = bfloat16(2 * i + 2);
+        src_vec[i] = pack_two_bfloat16_into_uint32(std::pair(bfloat_val1, bfloat_val2));
+    }
+
+    // create pad vector
+    bfloat16 pad_value = bfloat16(2);
+    std::vector<uint32_t> pad_vec(1, pack_two_bfloat16_into_uint32(std::pair(pad_value, pad_value)));
+
+    // create destination vector
+    constexpr uint32_t dst_M = 8;
+    constexpr uint32_t dst_N = 8;
+    uint32_t dst_num_values_unpacked = dst_M * dst_N;
+    uint32_t dst_num_values_packed = dst_num_values_unpacked / packing_ratio;
+    std::vector<uint32_t> dst_vec(dst_num_values_packed, 0);
+
+    // designate cores and core specs
+    CoreCoord start_core = {0, 0};
+    CoreCoord end_core = {0, 3};
+    CoreRange cores(start_core, end_core);
+    uint32_t num_cores = cores.size();
+
+    // configure and create DRAM buffers for input, pad, output
+    uint32_t src_buffer_size = packed_data_size * src_num_values_packed;
+    tt_metal::InterleavedBufferConfig input_dram_config {
+        .device = device,
+        .size = src_buffer_size,
+        .page_size = packed_data_size,
+        .buffer_type = tt_metal::BufferType::DRAM
+    };
+    std::shared_ptr<Buffer> src_buffer = CreateBuffer(input_dram_config);
+    uint32_t src_addr = src_buffer->address();
+
+    uint32_t pad_buffer_size = packed_data_size * pad_vec.size();
+    tt_metal::InterleavedBufferConfig pad_dram_config {
+        .device = device,
+        .size = pad_buffer_size,
+        .page_size = packed_data_size,
+        .buffer_type = tt_metal::BufferType::DRAM
+    };
+    std::shared_ptr<Buffer> pad_buffer = CreateBuffer(pad_dram_config);
+    uint32_t pad_addr = pad_buffer->address();
+
+    uint32_t dst_buffer_size = packed_data_size * dst_num_values_packed;
+    tt_metal::InterleavedBufferConfig output_dram_config {
+        .device = device,
+        .size = dst_buffer_size,
+        .page_size = packed_data_size,
+        .buffer_type = tt_metal::BufferType::DRAM
+    };
+    std::shared_ptr<Buffer> dst_buffer = CreateBuffer(output_dram_config);
+    uint32_t dst_addr = dst_buffer->address();
+
+    // configure and create circular buffer
+    uint32_t cb_id = CB::c_in0;
+    tt::DataFormat cb_data_format = tt::DataFormat::UInt32;
+    CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(dst_N * packed_data_size * 2, {{cb_id, cb_data_format}})
+        .set_page_size(cb_id, packed_data_size);
+    auto cb_src = tt::tt_metal::CreateCircularBuffer(program, cores, cb_config);
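+    // The circular buffer holds dst_N * 2 = 16 single-stick pages, enough for
+    // several packed destination rows (dst_N / 2 = 4 sticks each), giving the
+    // reader room to run ahead of the writer.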
+
+    // specify compile time args
+    bool src_is_dram = src_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
+    bool pad_is_dram = pad_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
+    bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
+    std::vector<uint32_t> reader_compile_time_args = {(uint32_t) src_is_dram,
+                                                      (uint32_t) pad_is_dram};
+    std::vector<uint32_t> writer_compile_time_args = {(uint32_t) dst_is_dram};
+
+    // create kernels
+    KernelHandle reader_id = CreateKernel(program,
+        "tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp",
+        cores,
+        tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = reader_compile_time_args});
+    KernelHandle writer_id = CreateKernel(program,
+        "tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp",
+        cores,
+        tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = writer_compile_time_args});
+
+    // set kernel runtime arguments
+    uint32_t start_src_idx = 0;
+    uint32_t start_dst_idx = 0;
+    uint32_t num_rows_per_core = src_M / num_cores;
+    uint32_t row_size_diff = dst_N - src_N;
+    uint32_t num_packed_row_src = src_N / packing_ratio;
+    uint32_t num_packed_row_dst = dst_N / packing_ratio;
+    uint32_t num_src_sticks_per_core = num_packed_row_src * num_rows_per_core;
+    for (uint32_t core_idx = 0; core_idx < num_cores; core_idx++) {
+        CoreCoord core = {0, core_idx};
+        tt_metal::SetRuntimeArgs(
+            program,
+            reader_id,
+            core,
+            {src_addr,
+             pad_addr,
+             start_src_idx,
+             row_size_diff / packing_ratio,
+             num_packed_row_dst,
+             packed_data_size,
+             num_rows_per_core
+            }
+        );
+        tt_metal::SetRuntimeArgs(
+            program,
+            writer_id,
+            core,
+            {dst_addr,
+             start_dst_idx,
+             num_packed_row_dst,
+             packed_data_size,
+             num_rows_per_core
+            }
+        );
+        start_src_idx += num_src_sticks_per_core;
+        start_dst_idx += num_packed_row_dst * num_rows_per_core;
+    }
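+    // With src_M = 8 rows over 4 cores, each core pads num_rows_per_core = 2
+    // rows: core 0 reads packed source sticks 0-3 (2 per row) and writes
+    // packed destination sticks 0-7 (4 per row); core 1 starts at source
+    // stick 4 and destination stick 8, and so on.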
+
+    printf("Padding tensor of shape (%d, %d) to shape (%d, %d) with pad value: %d\n", src_M, src_N, dst_M, dst_N, pad_value.to_uint16());
+    printf("Original tensor with shape (%d, %d):\n", src_M, src_N);
+    for (uint32_t m = 0; m < src_M; m++) {
+        for (uint32_t n = 0; n < num_packed_row_src; n++) {
+            printf("%d ", (uint16_t)src_vec[m * num_packed_row_src + n]);
+            printf("%d ", (uint16_t)(src_vec[m * num_packed_row_src + n] >> 16));
+        }
+        printf("\n");
+    }
+    printf("\n");
+
+    // dispatch program to device for execution
+    EnqueueWriteBuffer(cq, src_buffer, src_vec.data(), false);
+    EnqueueWriteBuffer(cq, pad_buffer, pad_vec.data(), false);
+    EnqueueProgram(cq, program, false);
+    EnqueueReadBuffer(cq, dst_buffer, dst_vec.data(), false);
+    Finish(cq);
+
+    printf("Padded tensor with shape (%d, %d):\n", dst_M, dst_N);
+    for (uint32_t m = 0; m < dst_M; m++) {
+        for (uint32_t n = 0; n < num_packed_row_dst; n++) {
+            printf("%d ", (uint16_t)dst_vec[m * num_packed_row_dst + n]);
+            printf("%d ", (uint16_t)(dst_vec[m * num_packed_row_dst + n] >> 16));
+        }
+        printf("\n");
+    }
+
+    CloseDevice(device);
+}
diff --git a/tt_metal/programming_examples/sharding/kernels/reader_sharded_rm.cpp b/tt_metal/programming_examples/sharding/kernels/reader_sharded_rm.cpp
new file mode 100644
index 00000000000..ff4487eb76a
--- /dev/null
+++ b/tt_metal/programming_examples/sharding/kernels/reader_sharded_rm.cpp
@@ -0,0 +1,42 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+#include "dataflow_api.h"
+#include "debug/dprint.h"
+
+// export TT_METAL_DPRINT_CORES='(0,0)-(0,3)' in order to see DPRINT messages
+
+void kernel_main() {
+    const uint32_t src_addr = get_arg_val<uint32_t>(0);
+    const uint32_t stick_size = get_arg_val<uint32_t>(1);
+    const uint32_t shard_height = get_arg_val<uint32_t>(2);
+    const uint32_t shard_width_bytes = get_arg_val<uint32_t>(3);
+    const uint32_t padded_offset_bytes = get_arg_val<uint32_t>(4);
+    const uint32_t start_id = get_arg_val<uint32_t>(5);
+    const uint32_t current_core = get_arg_val<uint32_t>(6);
+
+    constexpr uint32_t cb_id_in0 = get_compile_time_arg_val(0);
+    constexpr bool src_is_dram = get_compile_time_arg_val(1) == 1;
+    const InterleavedAddrGen<src_is_dram> s0 = {
+        .bank_base_address = src_addr,
+        .page_size = stick_size
+    };
+    uint32_t stick_id = start_id;
+    cb_reserve_back(cb_id_in0, shard_height);
+    uint32_t l1_write_addr = get_write_ptr(cb_id_in0);
+    DPRINT_DATA0(DPRINT << "Core (0," << current_core << "): ");
+    for (uint32_t h = 0; h < shard_height; ++h) {
+        uint64_t src_noc_addr = get_noc_addr(stick_id, s0);
+        noc_async_read(src_noc_addr, l1_write_addr, stick_size);
+        // wait for the stick to land in L1 before dereferencing it
+        noc_async_read_barrier();
+        // print both BFloat16 values that are packed into the page
+        uint32_t* read_ptr = (uint32_t*)l1_write_addr;
+        DPRINT_DATA0(DPRINT << (uint16_t)*read_ptr << " ");
+        DPRINT_DATA0(DPRINT << (uint16_t)(*read_ptr >> 16) << " ");
+        stick_id++;
+        l1_write_addr += padded_offset_bytes;
+    }
+    DPRINT_DATA0(DPRINT << ENDL());
+    cb_push_back(cb_id_in0, shard_height);
+}
diff --git a/tt_metal/programming_examples/sharding/shard_data_rm.cpp b/tt_metal/programming_examples/sharding/shard_data_rm.cpp
new file mode 100644
index 00000000000..9d5a8036662
--- /dev/null
+++ b/tt_metal/programming_examples/sharding/shard_data_rm.cpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/common/constants.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/common/bfloat16.hpp"
+#include "tt_metal/impl/dispatch/command_queue.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tt_metal/impl/device/device.hpp"
+
+using namespace tt;
+using namespace tt::tt_metal;
+
+int main(int argc, char **argv) {
+    // get program/device
+    int device_id = 0;
+    Device *device = CreateDevice(device_id);
+    CommandQueue& cq = device->command_queue();
+    Program program = CreateProgram();
+
+    // initialize source data - vector with shape (16, 1)
+    constexpr uint32_t M = 16;
+    constexpr uint32_t N = 1;
+    uint32_t num_values = M * N;
+    tt::DataFormat cb_data_format = tt::DataFormat::Float16_b;
+    std::vector<bfloat16> src_vec(num_values, bfloat16(0.0f));
+    // source vector - {2, 4, 6, ... , 28, 30, 32}
+    for (uint32_t i = 0; i < src_vec.size(); i++) {
+        src_vec[i] = bfloat16((i + 1) * 2);
+    }
+    constexpr size_t data_size = sizeof(bfloat16);
+
+    // designate 4 cores for utilization - cores (0,0), (0,1), (0,2), (0,3)
+    CoreCoord start_core = {0, 0};
+    CoreCoord end_core = {0, 3};
+    uint32_t num_cores = 4;
+    CoreRange cores(start_core, end_core);
+
+    // define shard specs - 16 values total and 4 cores -> 4 values per core
+    uint32_t shard_height = num_values / num_cores / data_size;
+    uint32_t shard_width = N;
+    uint32_t shard_size = shard_height * shard_width;
+    uint32_t input_unit_size = sizeof(uint32_t);
+    uint32_t shard_width_bytes = shard_width * data_size;
+    uint32_t num_units_per_row = shard_width * input_unit_size;
+    uint32_t padded_offset_bytes = align(input_unit_size, ALLOCATOR_ALIGNMENT);
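+    // Worked example: 16 bfloat16 values over 4 cores is 4 values per core;
+    // each uint32_t page (stick) holds 2 values, so shard_height =
+    // 16 / 4 / 2 = 2 sticks per core, and core 0 receives {2, 4, 6, 8}.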
+
+    // configure and create interleaved DRAM buffer to insert source data into
+    uint32_t src_buffer_size = input_unit_size * num_values / data_size;
+    tt_metal::InterleavedBufferConfig input_dram_config {
+        .device = device,
+        .size = src_buffer_size,
+        .page_size = input_unit_size,
+        .buffer_type = tt_metal::BufferType::DRAM
+    };
+    std::shared_ptr<Buffer> src_buffer = CreateBuffer(input_dram_config);
+    uint32_t src_addr = src_buffer->address();
+
+    // configure and create circular buffers with the same address on each of the designated cores
+    bool src_is_dram = src_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0;
+    uint32_t input_cb_index = CB::c_in0;
+    CircularBufferConfig input_cb_config = CircularBufferConfig(shard_size * input_unit_size, {{input_cb_index, cb_data_format}})
+        .set_page_size(input_cb_index, input_unit_size);
+    auto cb_input = tt_metal::CreateCircularBuffer(program, cores, input_cb_config);
+
+    // create data movement kernel to shard data
+    std::vector<uint32_t> reader_compile_time_args = {
+        (std::uint32_t)input_cb_index,
+        (std::uint32_t)src_is_dram};
+    auto reader_id = tt_metal::CreateKernel(
+        program,
+        "tt_metal/programming_examples/sharding/kernels/reader_sharded_rm.cpp",
+        cores,
+        tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = reader_compile_time_args});
+
+    // set runtime arguments for each core
+    uint32_t curr_idx_h = 0;
+    uint32_t curr_idx_w = 0;
+    for (uint32_t i = 0; i < num_cores; i++) {
+        CoreCoord core = {0, i};
+        tt_metal::SetRuntimeArgs(
+            program,
+            reader_id,
+            core,
+            {src_addr,
+             input_unit_size,
+             shard_height,
+             shard_width_bytes,
+             padded_offset_bytes,
+             curr_idx_h,
+             i});
+        curr_idx_w += input_unit_size;
+        if (curr_idx_w >= num_units_per_row) {
+            curr_idx_w = 0;
+            curr_idx_h += shard_height;
+        }
+    }
+
+    printf("Original tensor values: ");
+    for (uint32_t src_vec_idx = 0; src_vec_idx < src_vec.size(); src_vec_idx++) {
+        printf("%.0f ", src_vec[src_vec_idx].to_float());
+    }
+    printf("\n");
+
+    // start/finish program and close device
+    EnqueueWriteBuffer(cq, src_buffer, src_vec.data(), false);
+    EnqueueProgram(cq, program, false);
+    Finish(cq);
+    CloseDevice(device);
+}