diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h
index 672e458b7..c02765224 100644
--- a/runtime/include/tt/runtime/detail/ttnn.h
+++ b/runtime/include/tt/runtime/detail/ttnn.h
@@ -55,7 +55,11 @@
 #include "ttnn/operations/embedding/embedding.hpp"
 #include "ttnn/operations/matmul/matmul.hpp"
 #include "ttnn/operations/normalization/softmax/softmax.hpp"
+#include "ttnn/operations/pool/maxpool/max_pool2d.hpp"
 #include "ttnn/operations/reduction/generic/generic_reductions.hpp"
+#include "ttnn/tensor/tensor.hpp"
+#include "ttnn/tensor/types.hpp"
+
 #pragma clang diagnostic pop

 #include "tt/runtime/types.h"
diff --git a/runtime/lib/ttnn/CMakeLists.txt b/runtime/lib/ttnn/CMakeLists.txt
index ad2542456..98aa7fc98 100644
--- a/runtime/lib/ttnn/CMakeLists.txt
+++ b/runtime/lib/ttnn/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(operations)
 add_library(TTRuntimeTTNN
   STATIC
   runtime.cpp
@@ -5,11 +6,10 @@ add_library(TTRuntimeTTNN
 )
 # We have to set the C++ standard to 20 because tt-metal requires it
 set_property(TARGET TTRuntimeTTNN PROPERTY CXX_STANDARD 20)
-target_compile_options(TTRuntimeTTNN PRIVATE -mavx -mavx2)
 target_include_directories(TTRuntimeTTNN PUBLIC
   ${PROJECT_SOURCE_DIR}/runtime/include
   ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
 )
 target_include_directories(TTRuntimeTTNN PUBLIC "$")
-target_link_libraries(TTRuntimeTTNN PUBLIC TTNN_LIBRARY)
-add_dependencies(TTRuntimeTTNN TTNN_LIBRARY tt-metal FBS_GENERATION)
+target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps)
+add_dependencies(TTRuntimeTTNN TTRuntimeTTNNOps)
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
new file mode 100644
index 000000000..2fdc41485
--- /dev/null
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TYPES_H
+#define TTNN_RUNTIME_TYPES_H
+
+#include "tt/runtime/detail/ttnn.h"
+
+namespace tt::runtime::ttnn {
+
+using DeviceMap = std::unordered_map<uint32_t, ::ttnn::Device *>;
+using TensorMap = std::unordered_map<uint32_t, ::ttnn::Tensor *>;
+struct ProgramTensorPool {
+  ProgramTensorPool(const TensorMap &liveTensors) : liveTensors(liveTensors) {}
+
+  auto try_emplace(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
+    auto it = liveTensors.find(global_id);
+    if (it != liveTensors.end()) {
+      return std::make_pair(it, false);
+    }
+    assert(!intermedTensors.contains(global_id));
+    intermedTensors.try_emplace(global_id, tensor);
+    return liveTensors.try_emplace(global_id, &intermedTensors.at(global_id));
+  }
+
+  auto insert_or_assign(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
+    intermedTensors.insert_or_assign(global_id, tensor);
+    return liveTensors.insert_or_assign(global_id,
+                                        &intermedTensors.at(global_id));
+  }
+
+  ::ttnn::Tensor &at(std::uint32_t global_id) {
+    assert(liveTensors.contains(global_id));
+    return *liveTensors.at(global_id);
+  }
+
+  size_t erase(std::uint32_t global_id) {
+    assert(liveTensors.contains(global_id) &&
+           intermedTensors.contains(global_id));
+    intermedTensors.erase(global_id);
+    return liveTensors.erase(global_id);
+  }
+
+  bool contains(std::uint32_t global_id) const {
+    return liveTensors.contains(global_id);
+  }
+
+private:
+  // A superset of intermedTensors, containing pointers to all tensors created
+  // by the program and the input/output tensors passed in by the user
+  TensorMap liveTensors;
+
+  // A subset of liveTensors, containing values of any intermediate tensors
+  // created by the program
+  std::unordered_map<uint32_t, ::ttnn::Tensor> intermedTensors;
+};
+
+struct ProgramContext {
+  ProgramTensorPool tensorPool;
+  DeviceMap allDevices;
+  DeviceMap devicePool;
+
+  ProgramContext(const TensorMap &liveTensors, const DeviceMap &allDevices)
+      : tensorPool(ProgramTensorPool(liveTensors)), allDevices(allDevices) {}
+};
+} // namespace tt::runtime::ttnn
+
+#endif
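The two insertion paths of ProgramTensorPool above have deliberately different semantics. A minimal sketch of the difference (the id 42, makeTensor, and the harness are illustrative, not part of this PR):

    ::ttnn::Tensor makeTensor(); // hypothetical stand-in for any tensor source

    void sketch() {
      ::ttnn::Tensor userOutput = makeTensor(); // output owned by the caller
      TensorMap live;
      live.try_emplace(42u, &userOutput);       // 42 = illustrative global id

      ProgramTensorPool pool(live);
      ::ttnn::Tensor produced = makeTensor();

      pool.try_emplace(42u, produced);      // no-op: the caller's output wins
      pool.insert_or_assign(42u, produced); // repoints the live entry at a
                                            // fresh copy in intermedTensors
    }

This is why the creation ops below (empty.cpp, to_device.cpp) use try_emplace, so a user-provided program output is never overwritten, while compute ops use insert_or_assign.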
diff --git a/runtime/lib/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
similarity index 100%
rename from runtime/lib/ttnn/utils.h
rename to runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt
new file mode 100644
index 000000000..70a6b962c
--- /dev/null
+++ b/runtime/lib/ttnn/operations/CMakeLists.txt
@@ -0,0 +1,38 @@
+set(TTNN_OPS_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/include/tt/runtime/ttnn/operations/utils.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/conv/conv2d.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/creation/empty.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/creation/full.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/concat.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/reshape.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/data_movement/transpose.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/deletion/dealloc.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/eltwise/binary.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/eltwise/unary.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/embedding/embedding.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_device.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_layout.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_memory_config.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/matmul/matmul.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/normalization/softmax.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/pool/maxpool2d.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/reduction/reduction.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/context/get_device.cpp
+)
+
+add_library(TTRuntimeTTNNOps
+  STATIC
+  ${TTNN_OPS_SRCS}
+)
+
+set_property(TARGET TTRuntimeTTNNOps PROPERTY CXX_STANDARD 20)
+target_compile_options(TTRuntimeTTNNOps PUBLIC -mavx -mavx2)
+target_include_directories(TTRuntimeTTNNOps PUBLIC
+  ${PROJECT_SOURCE_DIR}/runtime/include
+  ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/include
+  ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include
+  ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
+)
+target_include_directories(TTRuntimeTTNNOps PUBLIC "$")
+target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY)
+add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION)
diff --git a/runtime/lib/ttnn/operations/context/get_device.cpp b/runtime/lib/ttnn/operations/context/get_device.cpp
new file mode 100644
index 000000000..36aa40be3
--- /dev/null
+++ b/runtime/lib/ttnn/operations/context/get_device.cpp
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "get_device.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::context {
+void run(const ::tt::target::ttnn::GetDeviceOp *op, ProgramContext &context) {
+  DeviceMap &devicePool = context.devicePool;
+  DeviceMap &allDevices = context.allDevices;
+  const flatbuffers::Vector<uint32_t> *chipIds = op->chip_ids();
+  assert(chipIds->size() == 1 && "Expected 1 chip id");
+  for (const uint32_t chipId : *chipIds) {
+    assert(allDevices.contains(chipId) && "Device not found");
+    auto [iter, inserted] =
+        devicePool.try_emplace(chipId, allDevices.at(chipId));
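+    // try_emplace yields {iterator, false} when the chip id is already in
+    // devicePool, so the assert below rejects duplicate chip ids.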
assert(inserted && "Duplicate device"); + } +} +} // namespace tt::runtime::ttnn::operations::context diff --git a/runtime/lib/ttnn/operations/context/get_device.h b/runtime/lib/ttnn/operations/context/get_device.h new file mode 100644 index 000000000..1d9b11201 --- /dev/null +++ b/runtime/lib/ttnn/operations/context/get_device.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_GET_DEVICE_H +#define TTNN_RUNTIME_GET_DEVICE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::context { +void run(const ::tt::target::ttnn::GetDeviceOp *op, ProgramContext &context); + +} // namespace tt::runtime::ttnn::operations::context + +#endif diff --git a/runtime/lib/ttnn/operations/conv/conv2d.cpp b/runtime/lib/ttnn/operations/conv/conv2d.cpp new file mode 100644 index 000000000..11bcb1697 --- /dev/null +++ b/runtime/lib/ttnn/operations/conv/conv2d.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "conv2d.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::conv { +void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.tensorPool; + DeviceMap &devicePool = context.devicePool; + const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); + const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id()); + std::optional<::ttnn::Tensor> bias = + op->bias() ? std::make_optional(tensorPool.at(op->bias()->global_id())) + : std::nullopt; + auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); + config.dtype = input.dtype(); + config.weights_dtype = weight.dtype(); + ::ttnn::Device &device = utils::getDevice(op->device(), devicePool); + ::ttnn::Tensor out = + std::get<0>(::ttnn::operations::conv::conv2d::conv2d<::ttnn::Device>( + input, weight, &device, op->in_channels(), op->out_channels(), + op->batch_size(), op->input_height(), op->input_width(), + {op->kernel_height(), op->kernel_width()}, + {op->stride_height(), op->stride_width()}, + {op->padding_height(), op->padding_width()}, + {op->dilation_height(), op->dilation_width()}, op->groups(), bias, + config)); + + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::conv diff --git a/runtime/lib/ttnn/operations/conv/conv2d.h b/runtime/lib/ttnn/operations/conv/conv2d.h new file mode 100644 index 000000000..3ce35f7af --- /dev/null +++ b/runtime/lib/ttnn/operations/conv/conv2d.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_CONV2D_H +#define TTNN_RUNTIME_CONV2D_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::conv { +void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context); + +} // namespace tt::runtime::ttnn::operations::conv + +#endif diff --git a/runtime/lib/ttnn/operations/creation/empty.cpp b/runtime/lib/ttnn/operations/creation/empty.cpp new file mode 100644 index 000000000..68bf6dc3f --- /dev/null +++ b/runtime/lib/ttnn/operations/creation/empty.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// 
SPDX-License-Identifier: Apache-2.0
+
+#include "empty.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(op->out());
+  // TODO(bug #582): ttnn::empty doesn't work properly with tile layout,
+  // using ROW_MAJOR until we fix it
+  auto desiredLayout = ::ttnn::Layout::ROW_MAJOR;
+  auto shape = ::ttnn::Shape(
+      ::tt::tt_metal::Shape(::tt::runtime::ttnn::utils::toShapeFromFBShape(
+          *op->out()->desc()->shape())));
+
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out =
+      ::ttnn::empty(shape, targetDataTypeTTNN, desiredLayout, device);
+  // use try_emplace here so the program output tensor doesn't get overwritten
+  tensorPool.try_emplace(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::creation
diff --git a/runtime/lib/ttnn/operations/creation/empty.h b/runtime/lib/ttnn/operations/creation/empty.h
new file mode 100644
index 000000000..ecc42a71e
--- /dev/null
+++ b/runtime/lib/ttnn/operations/creation/empty.h
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_EMPTY_H
+#define TTNN_RUNTIME_EMPTY_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+
+void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::creation
+
+#endif
diff --git a/runtime/lib/ttnn/operations/creation/full.cpp b/runtime/lib/ttnn/operations/creation/full.cpp
new file mode 100644
index 000000000..22b22f0cc
--- /dev/null
+++ b/runtime/lib/ttnn/operations/creation/full.cpp
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "full.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::DataType outputDataType = utils::getDataType(op->out());
+  auto shape = ::ttnn::Shape(
+      ::tt::tt_metal::Shape(::tt::runtime::ttnn::utils::toShapeFromFBShape(
+          *op->out()->desc()->shape())));
+  float fillValue = op->fill_value();
+  // TODO(bug #272), determine correct layout by tile shape in the future
+  ::ttnn::Layout outputLayout = ::ttnn::Layout::ROW_MAJOR;
+  std::optional<std::reference_wrapper<::ttnn::Device>> outputDevice =
+      std::make_optional(std::ref(device));
+  std::optional<::tt::tt_metal::MemoryConfig> outputMemoryConfig =
+      std::make_optional(utils::createMemoryConfig(op->out()));
+
+  ::ttnn::Tensor out =
+      ::ttnn::full(shape, fillValue, outputDataType, outputLayout, outputDevice,
+                   outputMemoryConfig);
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::creation
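A side note on the std::ref(device) line above: std::optional cannot hold a raw reference, so an optional device reference is expressed as std::optional<std::reference_wrapper<...>>. A self-contained illustration, with a plain int standing in for ::ttnn::Device:

    #include <functional>
    #include <optional>

    void illustrate() {
      int device = 7; // stand-in for ::ttnn::Device
      std::optional<std::reference_wrapper<int>> maybeRef = std::ref(device);
      if (maybeRef) {
        maybeRef->get() = 8; // writes through to `device`
      }
    }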
diff --git a/runtime/lib/ttnn/operations/creation/full.h b/runtime/lib/ttnn/operations/creation/full.h
new file mode 100644
index 000000000..2b2e003f6
--- /dev/null
+++ b/runtime/lib/ttnn/operations/creation/full.h
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_FULL_H
+#define TTNN_RUNTIME_FULL_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::creation {
+
+void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::creation
+
+#endif
diff --git a/runtime/lib/ttnn/operations/data_movement/concat.cpp b/runtime/lib/ttnn/operations/data_movement/concat.cpp
new file mode 100644
index 000000000..e904adc2a
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/concat.cpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "concat.h"
+#include "tt/runtime/detail/ttnn.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::ConcatOp *op, ProgramContext &context) {
+  std::vector<::ttnn::Tensor> inputs;
+  for (const auto &input : *op->inputs()) {
+    inputs.push_back(context.tensorPool.at(input->global_id()));
+  }
+  int32_t dim = op->dim();
+  ::ttnn::Tensor out = ::ttnn::concat(inputs, dim);
+  context.tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::data_movement
diff --git a/runtime/lib/ttnn/operations/data_movement/concat.h b/runtime/lib/ttnn/operations/data_movement/concat.h
new file mode 100644
index 000000000..396be6fb3
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/concat.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_CONCAT_H
+#define TTNN_RUNTIME_CONCAT_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::ConcatOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::data_movement
+
+#endif
diff --git a/runtime/lib/ttnn/operations/data_movement/reshape.cpp b/runtime/lib/ttnn/operations/data_movement/reshape.cpp
new file mode 100644
index 000000000..d91788687
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/reshape.cpp
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "reshape.h"
+#include "tt/runtime/detail/ttnn.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+
+template <int32_t Rank>
+static std::array<int32_t, Rank>
+vectorToArray(const std::vector<int32_t> &vec) {
+  if (vec.size() != Rank) {
+    throw std::invalid_argument("Vector size does not match array size");
+  }
+  std::array<int32_t, Rank> arr;
+  std::copy(vec.begin(), vec.end(), arr.begin());
+  return arr;
+}
+
+template <int32_t Rank>
+static ::ttnn::Tensor invoke_reshape(const ::ttnn::Tensor &tensor,
+                                     const std::vector<int32_t> &shape) {
+  return ::ttnn::reshape(tensor, vectorToArray<Rank>(shape));
+}
+
+void run(const ::tt::target::ttnn::ReshapeOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
+  const auto *fbShape = op->shape();
+  std::vector<int32_t> shape(fbShape->begin(), fbShape->end());
+  constexpr int32_t Rank1 = 1;
+  constexpr int32_t Rank2 = 2;
+  constexpr int32_t Rank3 = 3;
+  constexpr int32_t Rank4 = 4;
+  constexpr int32_t Rank5 = 5;
+
+  ::ttnn::Tensor out;
+  switch (fbShape->size()) {
+  case Rank1:
+    out = invoke_reshape<Rank1>(in, shape);
+    break;
+  case Rank2:
+    out = invoke_reshape<Rank2>(in, shape);
+    break;
+  case Rank3:
+    out = invoke_reshape<Rank3>(in, shape);
+    break;
+  case Rank4:
+    out = invoke_reshape<Rank4>(in, shape);
+    break;
+  case Rank5:
+    out = invoke_reshape<Rank5>(in, shape);
+    break;
+  default:
+    throw std::invalid_argument("Unsupported rank for reshape");
+  }
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::data_movement
diff --git a/runtime/lib/ttnn/operations/data_movement/reshape.h b/runtime/lib/ttnn/operations/data_movement/reshape.h
new file mode 100644
index 000000000..cfbd5ca9e
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/reshape.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_RESHAPE_H
+#define TTNN_RUNTIME_RESHAPE_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::ReshapeOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::data_movement
+
+#endif
diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.cpp b/runtime/lib/ttnn/operations/data_movement/transpose.cpp
new file mode 100644
index 000000000..fdfe5411b
--- /dev/null
+++ b/runtime/lib/ttnn/operations/data_movement/transpose.cpp
@@ -0,0 +1,40 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "transpose.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::data_movement {
+void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
+  int32_t dim0 = op->dim0();
+  int32_t dim1 = op->dim1();
+  auto inputRank = in.get_shape().rank();
+  // for the current version of permute, we need to work in 4D, so we add
+  // leading dimensions of size 1
+  std::vector<std::int64_t> dimensionOrder(4);
+  std::iota(dimensionOrder.begin(), dimensionOrder.end(), 0);
+  if (dim0 < 0) {
+    dim0 += 4;
+  } else {
+    dim0 = dim0 + 4 - inputRank;
+  }
+  if (dim1 < 0) {
+    dim1 += 4;
+  } else {
+    dim1 = dim1 + 4 - inputRank;
+  }
+  std::swap(dimensionOrder[dim0], dimensionOrder[dim1]);
+  // Ideally this would use ttnn::transpose, but since ttnn::transpose doesn't
+  // work at the moment, we use this temporary solution.
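+  // Worked example: for a rank-2 input with dim0 = 0 and dim1 = 1, both dims
+  // map to 4D indices 2 and 3 (dim + 4 - inputRank), so dimensionOrder
+  // becomes {0, 1, 3, 2}: the last two dims of the unsqueezed tensor swap.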
+ ::ttnn::Tensor unsqueezedInput = ::ttnn::unsqueeze_to_4D(in); + ::tt::tt_metal::MemoryConfig outputMemoryConfig = + utils::createMemoryConfig(op->out()); + ::ttnn::Tensor out = + ::ttnn::permute(unsqueezedInput, dimensionOrder, outputMemoryConfig); + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::data_movement diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.h b/runtime/lib/ttnn/operations/data_movement/transpose.h new file mode 100644 index 000000000..27f562094 --- /dev/null +++ b/runtime/lib/ttnn/operations/data_movement/transpose.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_TRANSPOSE_H +#define TTNN_RUNTIME_TRANSPOSE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::data_movement { +void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::data_movement + +#endif diff --git a/runtime/lib/ttnn/operations/deletion/dealloc.cpp b/runtime/lib/ttnn/operations/deletion/dealloc.cpp new file mode 100644 index 000000000..0fc992583 --- /dev/null +++ b/runtime/lib/ttnn/operations/deletion/dealloc.cpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "dealloc.h" +#include "tt/runtime/detail/ttnn.h" + +namespace tt::runtime::ttnn::operations::deletion { +void run(const ::tt::target::ttnn::DeallocOp *op, ProgramContext &context) { + bool force = true; + ProgramTensorPool &tensorPool = context.tensorPool; + ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); + tensor.deallocate(force); + tensorPool.erase(op->in()->global_id()); +} +} // namespace tt::runtime::ttnn::operations::deletion diff --git a/runtime/lib/ttnn/operations/deletion/dealloc.h b/runtime/lib/ttnn/operations/deletion/dealloc.h new file mode 100644 index 000000000..f0b0ea786 --- /dev/null +++ b/runtime/lib/ttnn/operations/deletion/dealloc.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_DEALLOC_H +#define TTNN_RUNTIME_DEALLOC_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::deletion { +void run(const ::tt::target::ttnn::DeallocOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::deletion + +#endif diff --git a/runtime/lib/ttnn/operations/eltwise/binary.cpp b/runtime/lib/ttnn/operations/eltwise/binary.cpp new file mode 100644 index 000000000..b6143d358 --- /dev/null +++ b/runtime/lib/ttnn/operations/eltwise/binary.cpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "binary.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" + +namespace tt::runtime::ttnn::operations::binary { + +static void +getEltwiseBinaryOPInputTensors(const ::tt::target::ttnn::EltwiseOp *op, + ProgramTensorPool &tensorPool, + ::ttnn::Tensor **lhs, ::ttnn::Tensor **rhs) { + assert(op->ins()->size() == 2 && "Expected 2 inputs"); + *lhs = &(tensorPool.at(op->ins()->Get(0)->global_id())); + *rhs = &(tensorPool.at(op->ins()->Get(1)->global_id())); +} + +static void runEltwiseBinaryOP( + const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool 
&tensorPool,
+    std::function<::ttnn::Tensor(
+        const ::ttnn::Tensor &, const ::ttnn::Tensor &,
+        const std::optional<const ::ttnn::DataType> &,
+        const std::optional<::tt::tt_metal::MemoryConfig> &,
+        std::optional<::ttnn::Tensor>,
+        std::optional<::ttnn::operations::unary::FusedActivations>,
+        std::optional<::ttnn::operations::unary::UnaryWithParam>)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *lhs = nullptr;
+  ::ttnn::Tensor *rhs = nullptr;
+  getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs);
+
+  ::ttnn::DataType outputDataType = utils::getDataType(op->out());
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig,
+                              std::nullopt, std::nullopt, std::nullopt);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+static void runEltwiseBinaryCompositeOP(
+    const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool,
+    std::function<
+        ::ttnn::Tensor(const ::ttnn::Tensor &, const ::ttnn::Tensor &,
+                       const std::optional<::tt::tt_metal::MemoryConfig> &)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *lhs = nullptr;
+  ::ttnn::Tensor *rhs = nullptr;
+  getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs);
+
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) {
+  assert(isBinaryOp(op) && "Expected binary operation");
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  switch (op->type()) {
+  /* Eltwise Binary */
+  case ::tt::target::ttnn::EltwiseOpType::Add: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::add);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Multiply: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::multiply);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Subtract: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::subtract);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::GreaterEqual: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::ge);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Div: {
+    runEltwiseBinaryOP(op, tensorPool, ::ttnn::divide);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Maximum: {
+    runEltwiseBinaryCompositeOP(op, tensorPool, ::ttnn::maximum);
+    break;
+  }
+  default:
+    throw std::invalid_argument("Unsupported Eltwise Binary operation");
+  }
+}
+
+} // namespace tt::runtime::ttnn::operations::binary
diff --git a/runtime/lib/ttnn/operations/eltwise/binary.h b/runtime/lib/ttnn/operations/eltwise/binary.h
new file mode 100644
index 000000000..56af49033
--- /dev/null
+++ b/runtime/lib/ttnn/operations/eltwise/binary.h
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_ELTWISE_BINARY_H
+#define TTNN_RUNTIME_ELTWISE_BINARY_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::binary {
+
+inline bool isBinaryOp(const ::tt::target::ttnn::EltwiseOp *op) {
+  return op->ins()->size() == 2;
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::binary
+
+#endif
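Extending this dispatch is mechanical. A hypothetical sketch for one more comparison op, assuming a matching ::ttnn::le entry point with the non-composite signature above and a corresponding EltwiseOpType value in the flatbuffer schema:

    case ::tt::target::ttnn::EltwiseOpType::LessEqual: {
      runEltwiseBinaryOP(op, tensorPool, ::ttnn::le);
      break;
    }

No build changes are needed for such a case, since it lives in the existing binary.cpp already listed in TTNN_OPS_SRCS.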
diff --git a/runtime/lib/ttnn/operations/eltwise/unary.cpp b/runtime/lib/ttnn/operations/eltwise/unary.cpp
new file mode 100644
index 000000000..7646f542c
--- /dev/null
+++ b/runtime/lib/ttnn/operations/eltwise/unary.cpp
@@ -0,0 +1,88 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "unary.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::unary {
+
+static void
+getEltwiseUnaryOPInputTensor(const ::tt::target::ttnn::EltwiseOp *op,
+                             ProgramTensorPool &tensorPool,
+                             ::ttnn::Tensor **in) {
+  assert(op->ins()->size() == 1 && "Expected 1 input");
+  *in = &(tensorPool.at(op->ins()->Get(0)->global_id()));
+}
+
+static void runEltwiseUnaryOP(
+    const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool,
+    std::function<
+        ::ttnn::Tensor(const ::ttnn::Tensor &,
+                       const std::optional<::tt::tt_metal::MemoryConfig> &,
+                       const std::optional<::ttnn::Tensor> &)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *in = nullptr;
+  getEltwiseUnaryOPInputTensor(op, tensorPool, &in);
+
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+static void runEltwiseUnaryWithFastAndApproximateModeOP(
+    const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool,
+    std::function<
+        ::ttnn::Tensor(const ::ttnn::Tensor &, const bool,
+                       const std::optional<::tt::tt_metal::MemoryConfig> &,
+                       const std::optional<::ttnn::Tensor> &)>
+        ttnnOp) {
+
+  ::ttnn::Tensor *in = nullptr;
+  getEltwiseUnaryOPInputTensor(op, tensorPool, &in);
+
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+
+  ::ttnn::Tensor out =
+      ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt);
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) {
+  assert(isUnaryOp(op) && "Expected unary operation");
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  switch (op->type()) {
+  case ::tt::target::ttnn::EltwiseOpType::Abs: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::abs);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Relu: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::relu);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Sqrt: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::sqrt);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Sigmoid: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::sigmoid);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Reciprocal: {
+    runEltwiseUnaryOP(op, tensorPool, ::ttnn::reciprocal);
+    break;
+  }
+  case ::tt::target::ttnn::EltwiseOpType::Exp: {
+    runEltwiseUnaryWithFastAndApproximateModeOP(op, tensorPool, ::ttnn::exp);
+    break;
+  }
+  default:
+    throw std::invalid_argument("Unsupported unary operation");
+  }
+}
+
+} // namespace tt::runtime::ttnn::operations::unary
diff --git a/runtime/lib/ttnn/operations/eltwise/unary.h b/runtime/lib/ttnn/operations/eltwise/unary.h
new file mode 100644
index 000000000..691e3ef67
--- /dev/null
+++ b/runtime/lib/ttnn/operations/eltwise/unary.h
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_ELTWISE_UNARY_H
+#define TTNN_RUNTIME_ELTWISE_UNARY_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::unary {
+
+inline bool isUnaryOp(const ::tt::target::ttnn::EltwiseOp *op) {
+  return op->ins()->size() == 1;
+}
+
+void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context);
+
+} // namespace tt::runtime::ttnn::operations::unary
+
+#endif
diff --git a/runtime/lib/ttnn/operations/embedding/embedding.cpp b/runtime/lib/ttnn/operations/embedding/embedding.cpp
new file mode 100644
index 000000000..433428eda
--- /dev/null
+++ b/runtime/lib/ttnn/operations/embedding/embedding.cpp
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "embedding.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::embedding {
+void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) {
+
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id());
+  const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id());
+  // default params for embedding op
+  std::optional<int> padToken = std::nullopt;
+  ::tt::tt_metal::Layout layout = ::ttnn::ROW_MAJOR_LAYOUT;
+  auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC;
+  ::ttnn::DataType outputDataType = utils::getDataType(op->output());
+  ::ttnn::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->output());
+  ::ttnn::Tensor out =
+      ::ttnn::embedding(input, weight, padToken, layout, embeddingsType,
+                        outputDataType, outputMemoryConfig);
+  tensorPool.insert_or_assign(op->output()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::embedding
diff --git a/runtime/lib/ttnn/operations/embedding/embedding.h b/runtime/lib/ttnn/operations/embedding/embedding.h
new file mode 100644
index 000000000..7c287bb50
--- /dev/null
+++ b/runtime/lib/ttnn/operations/embedding/embedding.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_EMBEDDING_H
+#define TTNN_RUNTIME_EMBEDDING_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::embedding {
+void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::embedding
+
+#endif
diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp
new file mode 100644
index 000000000..ce8173244
--- /dev/null
+++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::utils {
+
+bool isOnHost(const ::ttnn::Tensor &tensor) {
+  // Currently only supports borrowed or owned host storage
+  return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or
+         tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED;
+}
+
+bool isOnDevice(const ::ttnn::Tensor &tensor) {
+  // Currently only supports single device storage
+  return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE;
+}
+
+::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) {
+  return ::tt::runtime::ttnn::utils::toTTNNDataType(
+      tensorRef->desc()->layout()->memory_desc()->data_type());
+}
+::ttnn::Device &getDevice(const ::tt::target::DeviceRef *deviceRef,
+                          DeviceMap &devicePool) {
+  uint32_t deviceId = deviceRef->global_id();
+  assert(devicePool.contains(deviceId) && "Device not found in device pool");
+  return *devicePool.at(deviceId);
+}
+
+CoreRangeSet toCoreRangeSet(
+    const ::flatbuffers::Vector<const tt::target::Dim2dRange *> *coreRangeSet) {
+  std::set<CoreRange> coreRanges;
+  for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) {
+    CoreCoord start(coreRange->loc().x(), coreRange->loc().y());
+    // End is inclusive
+    CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1,
+                  coreRange->loc().y() + coreRange->size().y() - 1);
+
+    coreRanges.emplace(start, end);
+  }
+  return CoreRangeSet(coreRanges);
+}
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef) {
+  const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout();
+  const ::tt::target::TensorMemoryLayout targetMemoryLayout =
+      layout->memory_desc()->memory_layout();
+  const ::tt::target::MemorySpace targetMemorySpace =
+      layout->memory_desc()->memory_space();
+  const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+      *targetCoreRangeSet = layout->core_range_set();
+  const ::flatbuffers::Vector<int32_t> *targetShardShape =
+      layout->memory_desc()->shape();
+
+  // TODO (jnie): Hardcoding to interleaved and block sharded for now
+  // Add support for other types once compiler supports it
+  assert(targetMemoryLayout == ::tt::target::TensorMemoryLayout::Interleaved ||
+         targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded);
+  assert((targetMemoryLayout != target::TensorMemoryLayout::BlockSharded ||
+          targetMemorySpace == target::MemorySpace::DeviceL1) &&
+         "Only L1 memory space supports sharded memory layout");
+  assert(targetCoreRangeSet->size() == 1 &&
+         "Currently only single core range/grid is supported");
+  assert(targetShardShape->size() == 2 &&
+         "Only 2D shard shape is supported in TTNN backend");
+
+  CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet);
+  std::array<uint32_t, 2> ttnnShardShape;
+  std::copy(targetShardShape->begin(), targetShardShape->end(),
+            ttnnShardShape.begin());
+
+  if (targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded) {
+    assert(ttnnShardShape[0] % ::tt::constants::TILE_HEIGHT == 0 &&
+           ttnnShardShape[1] % ::tt::constants::TILE_WIDTH == 0 &&
+           "Shard shape must divide tile shape (32, 32) evenly");
+  }
+
+  ::tt::tt_metal::ShardSpec shardSpec(
+      ttnnCoreRangeSet, ttnnShardShape,
+      ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false);
+
+  ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout =
+      ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(targetMemoryLayout);
+
+  ::tt::tt_metal::BufferType ttnnBufferType =
+      ::tt::runtime::ttnn::utils::toTTNNBufferType(targetMemorySpace);
+
+  return {ttnnMemLayout, ttnnBufferType, shardSpec};
+}
+
+} // namespace tt::runtime::ttnn::operations::utils
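To make the inclusive-end convention in toCoreRangeSet concrete, a small worked instance (values are illustrative):

    // A Dim2dRange with loc = (0, 0) and size = (8, 8), i.e. an 8x8 grid
    // anchored at the origin, maps to the core range (0, 0) .. (7, 7):
    CoreCoord start(0, 0);
    CoreCoord end(0 + 8 - 1, 0 + 8 - 1); // (7, 7), end is inclusive
    CoreRange grid(start, end);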
diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h
new file mode 100644
index 000000000..a6b218536
--- /dev/null
+++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h
@@ -0,0 +1,32 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_OPERATIONS_UTILS_H
+#define TTNN_RUNTIME_OPERATIONS_UTILS_H
+
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+#include "types_generated.h"
+#include <optional>
+
+namespace tt::runtime::ttnn::operations::utils {
+
+bool isOnHost(const ::ttnn::Tensor &tensor);
+
+bool isOnDevice(const ::ttnn::Tensor &tensor);
+
+::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef);
+
+::ttnn::Device &getDevice(const ::tt::target::DeviceRef *deviceRef,
+                          DeviceMap &devicePool);
+
+CoreRangeSet toCoreRangeSet(
+    const ::flatbuffers::Vector<const tt::target::Dim2dRange *> *coreRangeSet);
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef);
+
+} // namespace tt::runtime::ttnn::operations::utils
+#endif
diff --git a/runtime/lib/ttnn/operations/layout/to_device.cpp b/runtime/lib/ttnn/operations/layout/to_device.cpp
new file mode 100644
index 000000000..f1ff3d44f
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_device.cpp
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "to_device.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id());
+  assert((utils::isOnHost(inputTensor) or utils::isOnDevice(inputTensor)) &&
+         "Unsupported storage type");
+
+  ::ttnn::TensorMemoryLayout tensorMemoryLayout =
+      ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(
+          op->memcfg()->tensor_memory_layout());
+
+  ::ttnn::BufferType bufferType;
+  switch (op->memcfg()->buffer_type()) {
+  case ::tt::target::BufferType::DRAM:
+    bufferType = ::ttnn::BufferType::DRAM;
+    break;
+  case ::tt::target::BufferType::L1:
+    bufferType = ::ttnn::BufferType::L1;
+    break;
+  case ::tt::target::BufferType::SystemMemory:
+    bufferType = ::ttnn::BufferType::SYSTEM_MEMORY;
+    break;
+  case ::tt::target::BufferType::L1Small:
+    bufferType = ::ttnn::BufferType::L1_SMALL;
+    break;
+  case ::tt::target::BufferType::Trace:
+    bufferType = ::ttnn::BufferType::TRACE;
+    break;
+  }
+
+  // TODO(bug #620):
+  // Until ShardSpec support is added in TTNN, read it from the output tensor.
+  // If ShardSpec is not supplied, an error will be thrown in ttnn lib.
+  //
+  const ::tt::target::LayoutDesc *layout = op->out()->desc()->layout();
+  const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+      *targetCoreRangeSet = layout->core_range_set();
+  const ::flatbuffers::Vector<int32_t> *targetShardShape =
+      layout->memory_desc()->shape();
+  CoreRangeSet ttnnCoreRangeSet = utils::toCoreRangeSet(targetCoreRangeSet);
+  std::array<uint32_t, 2> ttnnShardShape;
+  std::copy(targetShardShape->begin(), targetShardShape->end(),
+            ttnnShardShape.begin());
+  ::tt::tt_metal::ShardSpec shardSpec(
+      ttnnCoreRangeSet, ttnnShardShape,
+      ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false);
+
+  ::ttnn::MemoryConfig memoryConfig = {tensorMemoryLayout, bufferType,
+                                       shardSpec};
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out = ::ttnn::to_device(inputTensor, &device, memoryConfig);
+
+  tensorPool.try_emplace(op->out()->global_id(), out);
+}
+
+} // namespace tt::runtime::ttnn::operations::layout
diff --git a/runtime/lib/ttnn/operations/layout/to_device.h b/runtime/lib/ttnn/operations/layout/to_device.h
new file mode 100644
index 000000000..237211ed4
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_device.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TO_DEVICE_H
+#define TTNN_RUNTIME_TO_DEVICE_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::layout
+
+#endif
diff --git a/runtime/lib/ttnn/operations/layout/to_layout.cpp b/runtime/lib/ttnn/operations/layout/to_layout.cpp
new file mode 100644
index 000000000..adfeb8b43
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_layout.cpp
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "to_layout.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id());
+  assert((utils::isOnHost(inputTensor) or utils::isOnDevice(inputTensor)) &&
+         "Unsupported storage type");
+
+  ::ttnn::Layout layout;
+  switch (op->layout()) {
+  case ::tt::target::TensorLayout::RowMajor:
+    layout = ::ttnn::Layout::ROW_MAJOR;
+    break;
+  case ::tt::target::TensorLayout::Tile:
+    layout = ::ttnn::Layout::TILE;
+    break;
+  case ::tt::target::TensorLayout::Invalid:
+    layout = ::ttnn::Layout::INVALID;
+    break;
+  }
+
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out = ::ttnn::to_layout(inputTensor, layout, std::nullopt,
+                                         std::nullopt, &device);
+
+  tensorPool.try_emplace(op->out()->global_id(), out);
+}
+
+} // namespace tt::runtime::ttnn::operations::layout
diff --git a/runtime/lib/ttnn/operations/layout/to_layout.h b/runtime/lib/ttnn/operations/layout/to_layout.h
new file mode 100644
index 000000000..f397b47cc
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_layout.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TO_LAYOUT_H
+#define TTNN_RUNTIME_TO_LAYOUT_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::layout { +void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::layout + +#endif diff --git a/runtime/lib/ttnn/operations/layout/to_memory_config.cpp b/runtime/lib/ttnn/operations/layout/to_memory_config.cpp new file mode 100644 index 000000000..a24c16f08 --- /dev/null +++ b/runtime/lib/ttnn/operations/layout/to_memory_config.cpp @@ -0,0 +1,217 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "to_memory_config.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn::operations::layout { + +static ::ttnn::Tensor tilize(::ttnn::Tensor const &input) { + // NOLINTNEXTLINE + return ::ttnn::to_layout(input, ::ttnn::TILE_LAYOUT, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); +} + +static ::ttnn::Tensor untilize(::ttnn::Tensor const &input) { + return ::ttnn::to_layout(input, ::ttnn::ROW_MAJOR_LAYOUT, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); +} + +static ::ttnn::Tensor convertDataType(const ::ttnn::Tensor &input, + const ::ttnn::DataType &targetDataType) { + if (utils::isOnHost(input)) { + return ::ttnn::to_dtype(input, targetDataType); + } + + if (utils::isOnDevice(input)) { + if (input.get_layout() != ::ttnn::TILE_LAYOUT) { + // typecast op requires tilized tensor + ::ttnn::Tensor converted = + ::ttnn::typecast(tilize(input), targetDataType); + // untilize and return + return untilize(converted); + } + return ::ttnn::typecast(input, targetDataType); + } + + throw std::runtime_error("Unsupported storage type"); +} + +/* TODO(bug #272), ideal flow is to determine tilize/untilize with + * tile_shape */ +static ::ttnn::Tensor +updateLayoutAndDataType(const ::ttnn::Tensor &inputTensor, + const ::ttnn::DataType targetDataType, + const bool shouldTilize, const bool shouldUntilize) { + + ::ttnn::Tensor outputTensor = inputTensor; + const bool shouldConvertDataType = inputTensor.get_dtype() != targetDataType; + // const int targetTileX = targetTileShape->x(); + // const int targetTileY = targetTileShape->y(); + // const bool shouldTilize = + // targetTileX == 32 and targetTileY == 32 and + // inputTensor.get_layout() == ::ttnn::ROW_MAJOR_LAYOUT; + // const bool shouldUntilize = (targetTileX != 32 or targetTileY != 32) and + // inputTensor.get_layout() == + // ::ttnn::TILE_LAYOUT; + assert(not(shouldTilize and shouldUntilize) && + "Cannot tilize and untilize tensor at the same time"); + if (shouldTilize) { + outputTensor = tilize(outputTensor); + } else if (shouldUntilize) { + outputTensor = untilize(outputTensor); + } + if (shouldConvertDataType) { + outputTensor = convertDataType(outputTensor, targetDataType); + } + return outputTensor; +} + +static void +handleToHostMemoryConfigOp(const ::ttnn::Tensor &inputTensor, + const ::tt::target::TensorRef *outputTensorRef, + ProgramTensorPool &tensorPool) { + ::ttnn::Tensor result; + ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(outputTensorRef); + bool shouldTilize, shouldUntilize; + if (utils::isOnHost(inputTensor)) { + shouldTilize = false; + shouldUntilize = true; + result = updateLayoutAndDataType(inputTensor, targetDataTypeTTNN, + shouldTilize, shouldUntilize); + } else if (utils::isOnDevice(inputTensor)) { + shouldTilize = false; + shouldUntilize = true; + result 
= updateLayoutAndDataType(inputTensor.cpu(), targetDataTypeTTNN, + shouldTilize, shouldUntilize); + } + // copy the output to the output tensor if it exists + if (tensorPool.contains(outputTensorRef->global_id())) { + ::ttnn::Tensor &outputTensor = tensorPool.at(outputTensorRef->global_id()); + void *src = ::tt::tt_metal::get_raw_host_data_ptr(result); + void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor); + std::uint32_t size = result.volume() * result.element_size(); + std::memcpy(dst, src, size); + } else { + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } +} + +static void +handleToDramMemoryConfigOp(::ttnn::Device &device, + const ::ttnn::Tensor &inputTensor, + const ::tt::target::TensorRef *outputTensorRef, + ProgramTensorPool &tensorPool) { + ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(outputTensorRef); + ::tt::tt_metal::MemoryConfig targetMemoryConfig = + utils::createMemoryConfig(outputTensorRef); + bool shouldTilize, shouldUntilize; + if (utils::isOnHost(inputTensor)) { + ::ttnn::Tensor result = inputTensor; + shouldTilize = true; + shouldUntilize = false; + // device tilize requires BFLOAT16, if not then tilize on host + if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { + result = tilize(result); + shouldTilize = false; + } + result = ::ttnn::to_device(result, &device, targetMemoryConfig); + result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, + shouldUntilize); + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } else if (utils::isOnDevice(inputTensor)) { + shouldTilize = false; + shouldUntilize = false; + ::ttnn::Tensor result = updateLayoutAndDataType( + inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); + result = ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } +} + +static void +handleToL1MemoryConfigOp(::ttnn::Device &device, + const ::ttnn::Tensor &inputTensor, + const ::tt::target::TensorRef *outputTensorRef, + ProgramTensorPool &tensorPool) { + ::ttnn::DataType targetDataTypeTTNN = utils::getDataType(outputTensorRef); + ::tt::tt_metal::MemoryConfig targetMemoryConfig = + utils::createMemoryConfig(outputTensorRef); + bool shouldTilize, shouldUntilize; + if (utils::isOnHost(inputTensor)) { + ::ttnn::Tensor result = inputTensor; + // device tilize requires BFLOAT16, if not then tilize on host + if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { + result = tilize(result); + result = ::ttnn::to_device(result, &device, targetMemoryConfig); + shouldTilize = false; + shouldUntilize = false; + result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, + shouldUntilize); + } else { + shouldTilize = true; + shouldUntilize = false; + // device tilize op requires height sharded or interleaved tensors + // thus tilize first with default mem config, then convert memory config + result = ::ttnn::to_device(result, &device, std::nullopt); + result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, + shouldUntilize); + result = + ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); + } + tensorPool.insert_or_assign(outputTensorRef->global_id(), result); + } else if (utils::isOnDevice(inputTensor)) { + shouldTilize = false; + shouldUntilize = false; + ::ttnn::Tensor result = updateLayoutAndDataType( + inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); + result = ::ttnn::to_memory_config(result, targetMemoryConfig, 
std::nullopt);
+    tensorPool.insert_or_assign(outputTensorRef->global_id(), result);
+  }
+}
+
+// TODO(bug #272): right now hardcoding tilize/untilize, should determine with
+// tile shape blocked by issue #272
+void run(const ::tt::target::ttnn::ToMemoryConfigOp *op,
+         ProgramContext &context) {
+
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  DeviceMap &devicePool = context.devicePool;
+  const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in0()->global_id());
+  assert((utils::isOnHost(inputTensor) or utils::isOnDevice(inputTensor)) &&
+         "Unsupported storage type");
+
+  const ::tt::target::Dim2d *targetTileShape =
+      op->out()->desc()->layout()->memory_desc()->tile_shape();
+  assert(::tt::runtime::ttnn::utils::isValidTileShape(targetTileShape) &&
+         "Invalid tile shape");
+
+  const ::tt::target::MemorySpace targetMemorySpace =
+      op->out()->desc()->layout()->memory_desc()->memory_space();
+
+  switch (targetMemorySpace) {
+  // This case should only be used when gathering outputs at the end of the
+  // program
+  case ::tt::target::MemorySpace::System:
+  case ::tt::target::MemorySpace::SystemMMIO: {
+    handleToHostMemoryConfigOp(inputTensor, op->out(), tensorPool);
+    break;
+  }
+  case ::tt::target::MemorySpace::DeviceDRAM: {
+    ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+    handleToDramMemoryConfigOp(device, inputTensor, op->out(), tensorPool);
+    break;
+  }
+  case ::tt::target::MemorySpace::DeviceL1: {
+    ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+    handleToL1MemoryConfigOp(device, inputTensor, op->out(), tensorPool);
+    break;
+  }
+  }
+}
+} // namespace tt::runtime::ttnn::operations::layout
diff --git a/runtime/lib/ttnn/operations/layout/to_memory_config.h b/runtime/lib/ttnn/operations/layout/to_memory_config.h
new file mode 100644
index 000000000..269897981
--- /dev/null
+++ b/runtime/lib/ttnn/operations/layout/to_memory_config.h
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_TO_MEMORY_CONFIG_H
+#define TTNN_RUNTIME_TO_MEMORY_CONFIG_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::layout {
+void run(const ::tt::target::ttnn::ToMemoryConfigOp *op,
+         ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::layout
+
+#endif
diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp
new file mode 100644
index 000000000..458cfe444
--- /dev/null
+++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "matmul.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::matmul {
+// ANCHOR: adding_an_op_matmul_runtime
+void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id());
+  const ::ttnn::Tensor &rhs = tensorPool.at(op->in1()->global_id());
+  ::ttnn::DataType outputDataType = utils::getDataType(op->out());
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+  ::ttnn::Tensor out = ::ttnn::operations::matmul::matmul(
+      lhs, rhs, /*bias=*/std::nullopt,
+      ::ttnn::operations::matmul::Matmul{/*program_config=*/std::nullopt,
/*bcast_batch=*/std::nullopt, + outputMemoryConfig, outputDataType}); + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +// ANCHOR_END: adding_an_op_matmul_runtime + +} // namespace tt::runtime::ttnn::operations::matmul diff --git a/runtime/lib/ttnn/operations/matmul/matmul.h b/runtime/lib/ttnn/operations/matmul/matmul.h new file mode 100644 index 000000000..2db14d7b9 --- /dev/null +++ b/runtime/lib/ttnn/operations/matmul/matmul.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_MATMUL_H +#define TTNN_RUNTIME_MATMUL_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::matmul { +void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::matmul + +#endif diff --git a/runtime/lib/ttnn/operations/normalization/softmax.cpp b/runtime/lib/ttnn/operations/normalization/softmax.cpp new file mode 100644 index 000000000..eba2323d4 --- /dev/null +++ b/runtime/lib/ttnn/operations/normalization/softmax.cpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "softmax.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" + +namespace tt::runtime::ttnn::operations::normalization { +void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.tensorPool; + const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); + int32_t dimension = op->dimension(); + ::tt::tt_metal::MemoryConfig outputMemoryConfig = + utils::createMemoryConfig(op->out()); + ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::normalization diff --git a/runtime/lib/ttnn/operations/normalization/softmax.h b/runtime/lib/ttnn/operations/normalization/softmax.h new file mode 100644 index 000000000..caf2cbc54 --- /dev/null +++ b/runtime/lib/ttnn/operations/normalization/softmax.h @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTNN_RUNTIME_SOFTMAX_H +#define TTNN_RUNTIME_SOFTMAX_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::normalization { +void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::normalization + +#endif diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp new file mode 100644 index 000000000..742fad1bd --- /dev/null +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "maxpool2d.h" +#include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/operations/utils.h" + +namespace tt::runtime::ttnn::operations::pool { +void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.tensorPool; + DeviceMap &devicePool = context.devicePool; + const ::ttnn::Tensor &input = tensorPool.at(op->in()->global_id()); + const ::ttnn::operations::pool::MaxPool2DOp operation = + ::ttnn::operations::pool::MaxPool2DOp(); + + 
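+  // The leading 0 in the invoke call below is assumed to be the command
+  // queue id (queue 0); the remaining arguments mirror the MaxPool2dOp
+  // flatbuffer attributes.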
+  ::ttnn::Device &device = utils::getDevice(op->device(), devicePool);
+  ::ttnn::Tensor out = operation.invoke(
+      0, input, op->batch_size(), op->input_height(), op->input_width(),
+      op->channels(), {op->kernel_height(), op->kernel_width()},
+      {op->stride_height(), op->stride_width()},
+      {op->padding_height(), op->padding_width()},
+      {op->dilation_height(), op->dilation_width()}, &device);
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+} // namespace tt::runtime::ttnn::operations::pool
diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.h b/runtime/lib/ttnn/operations/pool/maxpool2d.h
new file mode 100644
index 000000000..f8746f817
--- /dev/null
+++ b/runtime/lib/ttnn/operations/pool/maxpool2d.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_MAXPOOL2D_H
+#define TTNN_RUNTIME_MAXPOOL2D_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::pool {
+void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::pool
+
+#endif
diff --git a/runtime/lib/ttnn/operations/reduction/reduction.cpp b/runtime/lib/ttnn/operations/reduction/reduction.cpp
new file mode 100644
index 000000000..88925e3d2
--- /dev/null
+++ b/runtime/lib/ttnn/operations/reduction/reduction.cpp
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "reduction.h"
+#include "tt/runtime/detail/ttnn.h"
+#include "tt/runtime/ttnn/operations/utils.h"
+
+namespace tt::runtime::ttnn::operations::reduction {
+static void runReductionOp(
+    ::tt::target::ttnn::ReductionOp const *op, ProgramTensorPool &tensorPool,
+    std::function<::ttnn::Tensor(
+        const ::ttnn::Tensor &,
+        const std::optional<std::variant<int, std::vector<int>>> &, const bool,
+        const std::optional<::tt::tt_metal::MemoryConfig> &,
+        const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)>
+        ttnnOp) {
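+  // ::ttnn::sum, ::ttnn::mean, and ::ttnn::max all share this call signature,
+  // so the switch in run() below can hand any of them in as `ttnnOp`; the
+  // dim_arg / keep_dim / memory-config decoding is then written only once.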
+  ::tt::tt_metal::MemoryConfig outputMemoryConfig =
+      utils::createMemoryConfig(op->out());
+  const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id());
+
+  const auto *fbDimArg = op->dim_arg();
+  std::optional<std::vector<int>> dimArg =
+      fbDimArg ? std::make_optional(
+                     std::vector<int>(fbDimArg->begin(), fbDimArg->end()))
+               : std::nullopt;
+
+  ::ttnn::Tensor out = ttnnOp(
+      in, dimArg, op->keep_dim(), outputMemoryConfig /* memory_config_arg */,
+      std::nullopt /* compute_kernel_config */, 1.0f /* scalar */);
+
+  tensorPool.insert_or_assign(op->out()->global_id(), out);
+}
+
+void run(const ::tt::target::ttnn::ReductionOp *op, ProgramContext &context) {
+  ProgramTensorPool &tensorPool = context.tensorPool;
+  switch (op->type()) {
+  case ::tt::target::ttnn::ReductionOpType::Sum: {
+    runReductionOp(op, tensorPool, ::ttnn::sum);
+    break;
+  }
+  case ::tt::target::ttnn::ReductionOpType::Mean: {
+    runReductionOp(op, tensorPool, ::ttnn::mean);
+    break;
+  }
+  case ::tt::target::ttnn::ReductionOpType::Max: {
+    runReductionOp(op, tensorPool, ::ttnn::max);
+    break;
+  }
+  }
+}
+} // namespace tt::runtime::ttnn::operations::reduction
diff --git a/runtime/lib/ttnn/operations/reduction/reduction.h b/runtime/lib/ttnn/operations/reduction/reduction.h
new file mode 100644
index 000000000..9e9d668b8
--- /dev/null
+++ b/runtime/lib/ttnn/operations/reduction/reduction.h
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTNN_RUNTIME_REDUCTION_H
+#define TTNN_RUNTIME_REDUCTION_H
+
+#include "tt/runtime/ttnn/types.h"
+#include "ttmlir/Target/TTNN/program_generated.h"
+
+namespace tt::runtime::ttnn::operations::reduction {
+void run(const ::tt::target::ttnn::ReductionOp *op, ProgramContext &context);
+} // namespace tt::runtime::ttnn::operations::reduction
+
+#endif
diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp
index 26a967a25..ef070ffe3 100644
--- a/runtime/lib/ttnn/program.cpp
+++ b/runtime/lib/ttnn/program.cpp
@@ -1,994 +1,102 @@
 // SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
 //
 // SPDX-License-Identifier: Apache-2.0
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "tt/runtime/detail/ttnn.h"
-#include "tt/runtime/runtime.h"
+#include "operations/context/get_device.h"
+#include "operations/conv/conv2d.h"
+#include "operations/creation/empty.h"
+#include "operations/creation/full.h"
+#include "operations/data_movement/concat.h"
+#include "operations/data_movement/reshape.h"
+#include "operations/data_movement/transpose.h"
+#include "operations/deletion/dealloc.h"
+#include "operations/eltwise/binary.h"
+#include "operations/eltwise/unary.h"
+#include "operations/embedding/embedding.h"
+#include "operations/layout/to_device.h"
+#include "operations/layout/to_layout.h"
+#include "operations/layout/to_memory_config.h"
+#include "operations/matmul/matmul.h"
+#include "operations/normalization/softmax.h"
+#include "operations/pool/maxpool2d.h"
+#include "operations/reduction/reduction.h"
+#include "tt/runtime/ttnn/types.h"
 #include "ttmlir/Target/TTNN/program_generated.h"
-#include "ttnn/device.hpp"
-#include "ttnn/operations/conv/conv2d/conv2d.hpp"
-#include "ttnn/operations/core/core.hpp"
-#include "ttnn/operations/pool/maxpool/max_pool2d.hpp"
-#include "ttnn/tensor/tensor.hpp"
-#include "ttnn/tensor/types.hpp"
-#include "ttnn/types.hpp"
-#include "types_generated.h"
-#include "utils.h"
-
-#include "ttmlir/Target/TTNN/Target.h"
-#include "ttmlir/Version.h"
 
 namespace tt::runtime::ttnn {
-class ProgramTensorPool {
-public:
-  ProgramTensorPool(
-      const std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors)
-      : liveTensors(liveTensors) {}
+struct ProgramExecutor {
+  ProgramContext context;
+  ProgramExecutor(const TensorMap &liveTensors, const DeviceMap &allDevices)
+      : context(ProgramContext(liveTensors, allDevices)) {}
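+
+  // Walks the flatbuffer program's operations in their serialized order and
+  // dispatches each one to the per-op run() implementation included above.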
-  auto try_emplace(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
-    auto it = liveTensors.find(global_id);
-    if (it != liveTensors.end()) {
-      return std::make_pair(it, false);
+  void execute(const ::tt::target::ttnn::Program *program) {
+    for (const ::tt::target::ttnn::Operation *op : *program->operations()) {
+      runOperation(op);
     }
-    assert(!intermedTensors.contains(global_id));
-    intermedTensors.try_emplace(global_id, tensor);
-    return liveTensors.try_emplace(global_id, &intermedTensors.at(global_id));
-  }
-
-  auto insert_or_assign(std::uint32_t global_id, const ::ttnn::Tensor &tensor) {
-    intermedTensors.insert_or_assign(global_id, tensor);
-    return liveTensors.insert_or_assign(global_id,
-                                        &intermedTensors.at(global_id));
-  }
-
-  ::ttnn::Tensor &at(std::uint32_t global_id) {
-    assert(liveTensors.contains(global_id));
-    return *liveTensors.at(global_id);
-  }
-
-  size_t erase(std::uint32_t global_id) {
-    assert(liveTensors.contains(global_id) &&
-           intermedTensors.contains(global_id));
-    intermedTensors.erase(global_id);
-    return liveTensors.erase(global_id);
-  }
-
-  bool contains(std::uint32_t global_id) const {
-    return liveTensors.contains(global_id);
   }
 
 private:
-  // A superset of intermedTensors, containing all tensors created by the
-  // program and the input/output tensors passed in by the user
-  std::unordered_map<std::uint32_t, ::ttnn::Tensor *> liveTensors;
-
-  // A subset of liveTensors, containing any intermediate tensors created by the
-  // program
-  std::unordered_map<std::uint32_t, ::ttnn::Tensor> intermedTensors;
+  void runOperation(const ::tt::target::ttnn::Operation *op);
 };
 
-static bool isOnHost(const ::ttnn::Tensor &tensor) {
-  // Currently only supports borrowed or owned host storage
-  return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or
-         tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED;
-}
-
-static bool isOnDevice(const ::ttnn::Tensor &tensor) {
-  // Currently only supports single device storage
-  return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE;
-}
-
-static ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) {
-  return utils::toTTNNDataType(
-      tensorRef->desc()->layout()->memory_desc()->data_type());
-}
-
-static ::ttnn::Device &
-getDevice(const ::tt::target::DeviceRef *deviceRef,
-          std::unordered_map<uint32_t, ::ttnn::Device *> &devicePool) {
-  uint32_t deviceId = deviceRef->global_id();
-  assert(devicePool.contains(deviceId) && "Device not found in device pool");
-  return *devicePool.at(deviceId);
-}
-
-static CoreRangeSet toCoreRangeSet(
-    const ::flatbuffers::Vector<const tt::target::Dim2dRange *> *coreRangeSet) {
-  std::set<CoreRange> coreRanges;
-  for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) {
-    CoreCoord start(coreRange->loc().x(), coreRange->loc().y());
-    // End is inclusive
-    CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1,
-                  coreRange->loc().y() + coreRange->size().y() - 1);
-
-    coreRanges.emplace(start, end);
-  }
-  return CoreRangeSet(coreRanges);
-}
-
-static ::tt::tt_metal::MemoryConfig
-createMemoryConfig(const ::tt::target::TensorRef *tensorRef) {
-  const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout();
-  const ::tt::target::TensorMemoryLayout targetMemoryLayout =
-      layout->memory_desc()->memory_layout();
-  const ::tt::target::MemorySpace targetMemorySpace =
-      layout->memory_desc()->memory_space();
-  const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
-      *targetCoreRangeSet = layout->core_range_set();
-  const ::flatbuffers::Vector<int32_t> *targetShardShape =
-      layout->memory_desc()->shape();
-
-  // TODO (jnie):
Hardcoding to interleaved and block sharded for now - // Add support for other types once compiler supports it - assert(targetMemoryLayout == ::tt::target::TensorMemoryLayout::Interleaved || - targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded); - assert(targetMemoryLayout != target::TensorMemoryLayout::BlockSharded || - targetMemorySpace == target::MemorySpace::DeviceL1 && - "Only L1 memory space supports sharded memory layout"); - assert(targetCoreRangeSet->size() == 1 && - "Currently only single core range/grid is supported"); - assert(targetShardShape->size() == 2 && - "Only 2D shard shape is supported in TTNN backend"); - - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - - if (targetMemoryLayout == ::tt::target::TensorMemoryLayout::BlockSharded) { - assert(ttnnShardShape[0] % ::tt::constants::TILE_HEIGHT == 0 && - ttnnShardShape[1] % ::tt::constants::TILE_WIDTH == 0 && - "Shard shape must divide tile shape (32, 32) evenly"); - } - - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout = - utils::toTTNNTensorMemoryLayout(targetMemoryLayout); - - ::tt::tt_metal::BufferType ttnnBufferType = - utils::toTTNNBufferType(targetMemorySpace); - - return {ttnnMemLayout, ttnnBufferType, shardSpec}; -} - -static ::ttnn::Tensor tilize(::ttnn::Tensor const &input) { - // NOLINTNEXTLINE - return ::ttnn::to_layout(input, ::ttnn::TILE_LAYOUT, std::nullopt, - std::nullopt, - static_cast<::ttnn::Device *>(nullptr)); -} - -static ::ttnn::Tensor untilize(::ttnn::Tensor const &input) { - return ::ttnn::to_layout(input, ::ttnn::ROW_MAJOR_LAYOUT, std::nullopt, - std::nullopt, - static_cast<::ttnn::Device *>(nullptr)); -} - -static ::ttnn::Tensor convertDataType(const ::ttnn::Tensor &input, - const ::ttnn::DataType &targetDataType) { - if (isOnHost(input)) { - return ::ttnn::to_dtype(input, targetDataType); - } - - if (isOnDevice(input)) { - if (input.get_layout() != ::ttnn::TILE_LAYOUT) { - // typecast op requires tilized tensor - ::ttnn::Tensor converted = - ::ttnn::typecast(tilize(input), targetDataType); - // untilize and return - return untilize(converted); - } - return ::ttnn::typecast(input, targetDataType); - } - - throw std::runtime_error("Unsupported storage type"); -} - -/* TODO(bug #272), ideal flow is to determine tilize/untilize with - * tile_shape */ -static ::ttnn::Tensor -updateLayoutAndDataType(const ::ttnn::Tensor &inputTensor, - const ::ttnn::DataType targetDataType, - const bool shouldTilize, const bool shouldUntilize) { - - ::ttnn::Tensor outputTensor = inputTensor; - const bool shouldConvertDataType = inputTensor.get_dtype() != targetDataType; - // const int targetTileX = targetTileShape->x(); - // const int targetTileY = targetTileShape->y(); - // const bool shouldTilize = - // targetTileX == 32 and targetTileY == 32 and - // inputTensor.get_layout() == ::ttnn::ROW_MAJOR_LAYOUT; - // const bool shouldUntilize = (targetTileX != 32 or targetTileY != 32) and - // inputTensor.get_layout() == - // ::ttnn::TILE_LAYOUT; - assert(not(shouldTilize and shouldUntilize) && - "Cannot tilize and untilize tensor at the same time"); - if (shouldTilize) { - outputTensor = tilize(outputTensor); - } else if (shouldUntilize) { - outputTensor = untilize(outputTensor); - } - if (shouldConvertDataType) { - outputTensor = 
convertDataType(outputTensor, targetDataType); - } - return outputTensor; -} - -static void -handleToHostMemoryConfigOp(const ::ttnn::Tensor &inputTensor, - const ::tt::target::TensorRef *outputTensorRef, - ProgramTensorPool &tensorPool) { - ::ttnn::Tensor result; - ::ttnn::DataType targetDataTypeTTNN = getDataType(outputTensorRef); - bool shouldTilize, shouldUntilize; - if (isOnHost(inputTensor)) { - shouldTilize = false; - shouldUntilize = true; - result = updateLayoutAndDataType(inputTensor, targetDataTypeTTNN, - shouldTilize, shouldUntilize); - } else if (isOnDevice(inputTensor)) { - shouldTilize = false; - shouldUntilize = true; - result = updateLayoutAndDataType(inputTensor.cpu(), targetDataTypeTTNN, - shouldTilize, shouldUntilize); - } - // copy the output to the output tensor if it exists - if (tensorPool.contains(outputTensorRef->global_id())) { - ::ttnn::Tensor &outputTensor = tensorPool.at(outputTensorRef->global_id()); - void *src = ::tt::tt_metal::get_raw_host_data_ptr(result); - void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor); - std::uint32_t size = result.volume() * result.element_size(); - std::memcpy(dst, src, size); - } else { - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } -} - -static void -handleToDramMemoryConfigOp(::ttnn::Device &device, - const ::ttnn::Tensor &inputTensor, - const ::tt::target::TensorRef *outputTensorRef, - ProgramTensorPool &tensorPool) { - ::ttnn::DataType targetDataTypeTTNN = getDataType(outputTensorRef); - ::tt::tt_metal::MemoryConfig targetMemoryConfig = - createMemoryConfig(outputTensorRef); - bool shouldTilize, shouldUntilize; - if (isOnHost(inputTensor)) { - ::ttnn::Tensor result = inputTensor; - shouldTilize = true; - shouldUntilize = false; - // device tilize requires BFLOAT16, if not then tilize on host - if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { - result = tilize(result); - shouldTilize = false; - } - result = ::ttnn::to_device(result, &device, targetMemoryConfig); - result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, - shouldUntilize); - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } else if (isOnDevice(inputTensor)) { - shouldTilize = false; - shouldUntilize = false; - ::ttnn::Tensor result = updateLayoutAndDataType( - inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); - result = ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } -} - -static void -handleToL1MemoryConfigOp(::ttnn::Device &device, - const ::ttnn::Tensor &inputTensor, - const ::tt::target::TensorRef *outputTensorRef, - ProgramTensorPool &tensorPool) { - ::ttnn::DataType targetDataTypeTTNN = getDataType(outputTensorRef); - ::tt::tt_metal::MemoryConfig targetMemoryConfig = - createMemoryConfig(outputTensorRef); - bool shouldTilize, shouldUntilize; - if (isOnHost(inputTensor)) { - ::ttnn::Tensor result = inputTensor; - // device tilize requires BFLOAT16, if not then tilize on host - if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) { - result = tilize(result); - result = ::ttnn::to_device(result, &device, targetMemoryConfig); - shouldTilize = false; - shouldUntilize = false; - result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, - shouldUntilize); - } else { - shouldTilize = true; - shouldUntilize = false; - // device tilize op requires height sharded or interleaved tensors - // thus tilize first with default mem config, then convert memory config 
- result = ::ttnn::to_device(result, &device, std::nullopt); - result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize, - shouldUntilize); - result = - ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); - } - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } else if (isOnDevice(inputTensor)) { - shouldTilize = false; - shouldUntilize = false; - ::ttnn::Tensor result = updateLayoutAndDataType( - inputTensor, targetDataTypeTTNN, shouldTilize, shouldUntilize); - result = ::ttnn::to_memory_config(result, targetMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(outputTensorRef->global_id(), result); - } -} - -// TODO(bug #272): right now hardcoding tilize/untilize, should determine with -// tile shape blocked by issue #272 -static void run(::tt::target::ttnn::ToMemoryConfigOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - - const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in0()->global_id()); - assert(isOnHost(inputTensor) or - isOnDevice(inputTensor) && "Unsupported storage type"); - - const ::tt::target::Dim2d *targetTileShape = - op->out()->desc()->layout()->memory_desc()->tile_shape(); - assert(utils::isValidTileShape(targetTileShape) && "Invalid tile shape"); - - const ::tt::target::MemorySpace targetMemorySpace = - op->out()->desc()->layout()->memory_desc()->memory_space(); - - switch (targetMemorySpace) { - // This case should only be used when gathering outputs at the end of the - // program - case ::tt::target::MemorySpace::System: - case ::tt::target::MemorySpace::SystemMMIO: { - handleToHostMemoryConfigOp(inputTensor, op->out(), tensorPool); - break; - } - case ::tt::target::MemorySpace::DeviceDRAM: { - ::ttnn::Device &device = getDevice(op->device(), devicePool); - handleToDramMemoryConfigOp(device, inputTensor, op->out(), tensorPool); - break; - } - case ::tt::target::MemorySpace::DeviceL1: { - ::ttnn::Device &device = getDevice(op->device(), devicePool); - handleToL1MemoryConfigOp(device, inputTensor, op->out(), tensorPool); - break; - } - } -} - -static void run(::tt::target::ttnn::ToLayoutOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - - const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); - assert((isOnHost(inputTensor) or isOnDevice(inputTensor)) && - "Unsupported storage type"); - - ::ttnn::Layout layout; - switch (op->layout()) { - case ::tt::target::TensorLayout::RowMajor: - layout = ::ttnn::Layout::ROW_MAJOR; - break; - case ::tt::target::TensorLayout::Tile: - layout = ::ttnn::Layout::TILE; - break; - case ::tt::target::TensorLayout::Invalid: - layout = ::ttnn::Layout::INVALID; - break; - } - - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = ::ttnn::to_layout(inputTensor, layout, std::nullopt, - std::nullopt, &device); - - tensorPool.try_emplace(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::ToDeviceOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - - const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); - assert((isOnHost(inputTensor) or isOnDevice(inputTensor)) && - "Unsupported storage type"); - - op->memcfg()->tensor_memory_layout(); - op->memcfg()->buffer_type(); - - ::ttnn::TensorMemoryLayout tensorMemoryLayout; - switch (op->memcfg()->tensor_memory_layout()) { - case ::tt::target::TensorMemoryLayout::Interleaved: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::INTERLEAVED; - break; - case 
::tt::target::TensorMemoryLayout::SingleBank: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::SINGLE_BANK; - break; - case ::tt::target::TensorMemoryLayout::HeightSharded: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED; - break; - case ::tt::target::TensorMemoryLayout::WidthSharded: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::WIDTH_SHARDED; - break; - case ::tt::target::TensorMemoryLayout::BlockSharded: - tensorMemoryLayout = ::ttnn::TensorMemoryLayout::BLOCK_SHARDED; - break; - case ::tt::target::TensorMemoryLayout::None: - assert(false && - "Unsupported tensor memory layout TensorMemoryLayout::None"); - break; - } - - ::ttnn::BufferType bufferType; - switch (op->memcfg()->buffer_type()) { - case ::tt::target::BufferType::DRAM: - bufferType = ::ttnn::BufferType::DRAM; - break; - case ::tt::target::BufferType::L1: - bufferType = ::ttnn::BufferType::L1; - break; - case ::tt::target::BufferType::SystemMemory: - bufferType = ::ttnn::BufferType::SYSTEM_MEMORY; - break; - case ::tt::target::BufferType::L1Small: - bufferType = ::ttnn::BufferType::L1_SMALL; - break; - case ::tt::target::BufferType::Trace: - bufferType = ::ttnn::BufferType::TRACE; - break; - } - - // TODO(bug #620): - // Until ShardSpec support is added in TTNN, read it from the output tensor. - // If ShardSpec is not supplied, an error will be thrown in ttnn lib. - // - const ::tt::target::LayoutDesc *layout = op->out()->desc()->layout(); - const ::flatbuffers::Vector - *targetCoreRangeSet = layout->core_range_set(); - const ::flatbuffers::Vector *targetShardShape = - layout->memory_desc()->shape(); - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::ttnn::MemoryConfig memoryConfig = {tensorMemoryLayout, bufferType, - shardSpec}; - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = ::ttnn::to_device(inputTensor, &device, memoryConfig); - - tensorPool.try_emplace(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::EmptyOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - ::ttnn::DataType targetDataTypeTTNN = getDataType(op->out()); - // TODO(bug #582): ttnn::empty doesn't work properly with tile layout, - // using ROW_MAJOR until we fix it - auto desiredLayout = ::ttnn::Layout::ROW_MAJOR; - auto shape = ::ttnn::Shape(::tt::tt_metal::Shape( - utils::toShapeFromFBShape(*op->out()->desc()->shape()))); - - // Create output memory config for the op - // - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = ::ttnn::empty(shape, targetDataTypeTTNN, desiredLayout, - device, outputMemoryConfig); - - // use try emplace here so the program output tensor doesn't get overwritten - tensorPool.try_emplace(op->out()->global_id(), out); -} - -static void -getEltwiseBinaryOPInputTensors(::tt::target::ttnn::EltwiseOp const *op, - ProgramTensorPool &tensorPool, - ::ttnn::Tensor **lhs, ::ttnn::Tensor **rhs) { - assert(op->ins()->size() == 2 && "Expected 2 inputs"); - *lhs = &(tensorPool.at(op->ins()->Get(0)->global_id())); - *rhs = &(tensorPool.at(op->ins()->Get(1)->global_id())); -} - -static void runEltwiseBinaryOP( - ::tt::target::ttnn::EltwiseOp 
const *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor( - const ::ttnn::Tensor &, const ::ttnn::Tensor &, - const std::optional &, - const std::optional<::tt::tt_metal::MemoryConfig> &, - std::optional<::ttnn::Tensor>, - std::optional<::ttnn::operations::unary::FusedActivations>, - std::optional<::ttnn::operations::unary::UnaryWithParam>)> - ttnnOp) { - - ::ttnn::Tensor *lhs = nullptr; - ::ttnn::Tensor *rhs = nullptr; - getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs); - - ::ttnn::DataType outputDataType = getDataType(op->out()); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig, - std::nullopt, std::nullopt, std::nullopt); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void runEltwiseBinaryCompositeOP( - ::tt::target::ttnn::EltwiseOp const *op, ProgramTensorPool &tensorPool, - std::function< - ::ttnn::Tensor(const ::ttnn::Tensor &, const ::ttnn::Tensor &, - const std::optional<::tt::tt_metal::MemoryConfig> &)> - ttnnOp) { - - ::ttnn::Tensor *lhs = nullptr; - ::ttnn::Tensor *rhs = nullptr; - getEltwiseBinaryOPInputTensors(op, tensorPool, &lhs, &rhs); - - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void -getEltwiseUnaryOPInputTensor(::tt::target::ttnn::EltwiseOp const *op, - ProgramTensorPool &tensorPool, - ::ttnn::Tensor **in) { - assert(op->ins()->size() == 1 && "Expected 1 input"); - *in = &(tensorPool.at(op->ins()->Get(0)->global_id())); -} - -static void runEltwiseUnaryOP( - ::tt::target::ttnn::EltwiseOp const *op, ProgramTensorPool &tensorPool, - std::function< - ::ttnn::Tensor(const ::ttnn::Tensor &, - const std::optional<::tt::tt_metal::MemoryConfig> &, - const std::optional<::ttnn::Tensor> &)> - ttnnOp) { - - ::ttnn::Tensor *in = nullptr; - getEltwiseUnaryOPInputTensor(op, tensorPool, &in); - - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void runEltwiseUnaryWithFastAndApproximateModeOP( - ::tt::target::ttnn::EltwiseOp const *op, ProgramTensorPool &tensorPool, - std::function< - ::ttnn::Tensor(const ::ttnn::Tensor &, const bool, - const std::optional<::tt::tt_metal::MemoryConfig> &, - const std::optional<::ttnn::Tensor> &)> - ttnnOp) { - - ::ttnn::Tensor *in = nullptr; - getEltwiseUnaryOPInputTensor(op, tensorPool, &in); - - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - - ::ttnn::Tensor out = - ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::EltwiseOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - switch (op->type()) { - /* Eltwise Binary */ - case ::tt::target::ttnn::EltwiseOpType::Add: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::add); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Multiply: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::multiply); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Subtract: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::subtract); - break; - } - case ::tt::target::ttnn::EltwiseOpType::GreaterEqual: { - 
runEltwiseBinaryOP(op, tensorPool, ::ttnn::ge); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Div: { - runEltwiseBinaryOP(op, tensorPool, ::ttnn::divide); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Maximum: { - runEltwiseBinaryCompositeOP(op, tensorPool, ::ttnn::maximum); - break; - } - /* Eltwise Unary */ - case ::tt::target::ttnn::EltwiseOpType::Abs: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::abs); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Neg: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::neg); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Relu: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::relu); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Sqrt: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::sqrt); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Sigmoid: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::sigmoid); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Reciprocal: { - runEltwiseUnaryOP(op, tensorPool, ::ttnn::reciprocal); - break; - } - case ::tt::target::ttnn::EltwiseOpType::Exp: { - runEltwiseUnaryWithFastAndApproximateModeOP(op, tensorPool, ::ttnn::exp); - break; - } - } -} - -static void runReductionOp( - ::tt::target::ttnn::ReductionOp const *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor( - const ::ttnn::Tensor &, - const std::optional>> &, const bool, - const std::optional<::tt::tt_metal::MemoryConfig> &, - const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)> - ttnnOp) { - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - - const auto *fbDimArg = op->dim_arg(); - std::optional> dimArg = - fbDimArg ? std::make_optional( - std::vector(fbDimArg->begin(), fbDimArg->end())) - : std::nullopt; - - ::ttnn::Tensor out = ttnnOp( - in, dimArg, op->keep_dim(), outputMemoryConfig /* memory_config_arg */, - std::nullopt /* compute_kernel_config */, 1.0f /* scalar */); - - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::ReductionOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - switch (op->type()) { - case ::tt::target::ttnn::ReductionOpType::Sum: { - runReductionOp(op, tensorPool, ::ttnn::sum); - break; - } - case ::tt::target::ttnn::ReductionOpType::Mean: { - runReductionOp(op, tensorPool, ::ttnn::mean); - break; - } - case ::tt::target::ttnn::ReductionOpType::Max: { - runReductionOp(op, tensorPool, ::ttnn::max); - break; - } - } -} - -template -static std::array -vectorToArray(const std::vector &vec) { - if (vec.size() != Rank) { - throw std::invalid_argument("Vector size does not match array size"); - } - std::array arr; - std::copy(vec.begin(), vec.end(), arr.begin()); - return arr; -} - -template -static ::ttnn::Tensor invoke_reshape(const ::ttnn::Tensor &tensor, - const std::vector &shape) { - return ::ttnn::reshape(tensor, vectorToArray(shape)); -} - -static void run(::tt::target::ttnn::ReshapeOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - const auto *fbShape = op->shape(); - std::vector shape(fbShape->begin(), fbShape->end()); - constexpr int32_t Rank1 = 1; - constexpr int32_t Rank2 = 2; - constexpr int32_t Rank3 = 3; - constexpr int32_t Rank4 = 4; - constexpr int32_t Rank5 = 5; - - ::ttnn::Tensor out; - switch (fbShape->size()) { - case Rank1: - out = invoke_reshape(in, shape); - break; - 
case Rank2: - out = invoke_reshape(in, shape); - break; - case Rank3: - out = invoke_reshape(in, shape); - break; - case Rank4: - out = invoke_reshape(in, shape); - break; - case Rank5: - out = invoke_reshape(in, shape); - break; - default: - throw std::invalid_argument("Unsupported rank for reshape"); - } - - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::EmbeddingOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); - const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id()); - // default params for embedding op - std::optional padToken = std::nullopt; - ::tt::tt_metal::Layout layout = ::ttnn::ROW_MAJOR_LAYOUT; - auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC; - ::ttnn::DataType outputDataType = getDataType(op->output()); - ::ttnn::MemoryConfig outputMemoryConfig = createMemoryConfig(op->output()); - ::ttnn::Tensor out = - ::ttnn::embedding(input, weight, padToken, layout, embeddingsType, - outputDataType, outputMemoryConfig); - tensorPool.insert_or_assign(op->output()->global_id(), out); -} - -static void run(::tt::target::ttnn::SoftmaxOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - int32_t dimension = op->dimension(); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::TransposeOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); - int32_t dim0 = op->dim0(); - int32_t dim1 = op->dim1(); - auto inputRank = in.get_shape().rank(); - // for the current version of permute, we need to work in 4D, so we add - // leading dimensions of size 1 - std::vector dimensionOrder(4); - std::iota(dimensionOrder.begin(), dimensionOrder.end(), 0); - if (dim0 < 0) { - dim0 += 4; - } else { - dim0 = dim0 + 4 - inputRank; - } - if (dim1 < 0) { - dim1 += 4; - } else { - dim1 = dim1 + 4 - inputRank; - } - std::swap(dimensionOrder[dim0], dimensionOrder[dim1]); - // Ideally this would use ttnn::transpose, but since ttnn::transpose doesn't - // work at the moment, we use this temporary solution. 
- ::ttnn::Tensor unsqueezedInput = ::ttnn::unsqueeze_to_4D(in); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - ::ttnn::Tensor out = - ::ttnn::permute(unsqueezedInput, dimensionOrder, outputMemoryConfig); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void run(::tt::target::ttnn::ConcatOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - std::vector<::ttnn::Tensor> inputs; - for (const auto &input : *op->inputs()) { - inputs.push_back(tensorPool.at(input->global_id())); - } - int32_t dim = op->dim(); - ::ttnn::Tensor out = ::ttnn::concat(inputs, dim); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -// ANCHOR: adding_an_op_matmul_runtime -static void run(::tt::target::ttnn::MatmulOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &lhs = tensorPool.at(op->in0()->global_id()); - const ::ttnn::Tensor &rhs = tensorPool.at(op->in1()->global_id()); - ::ttnn::DataType outputDataType = getDataType(op->out()); - ::tt::tt_metal::MemoryConfig outputMemoryConfig = - createMemoryConfig(op->out()); - ::ttnn::Tensor out = ::ttnn::operations::matmul::matmul( - lhs, rhs, /*bias=*/std::nullopt, - ::ttnn::operations::matmul::Matmul{/*program_config=*/std::nullopt, - /*bcast_batch=*/std::nullopt, - outputMemoryConfig, outputDataType}); - tensorPool.insert_or_assign(op->out()->global_id(), out); -} -// ANCHOR_END: adding_an_op_matmul_runtime - -static void run(::tt::target::ttnn::Conv2dOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &input = tensorPool.at(op->input()->global_id()); - const ::ttnn::Tensor &weight = tensorPool.at(op->weight()->global_id()); - std::optional<::ttnn::Tensor> bias = - op->bias() ? 
std::make_optional(tensorPool.at(op->bias()->global_id())) - : std::nullopt; - auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); - config.dtype = input.dtype(); - config.weights_dtype = weight.dtype(); - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = - std::get<0>(::ttnn::operations::conv::conv2d::conv2d<::ttnn::Device>( - input, weight, &device, op->in_channels(), op->out_channels(), - op->batch_size(), op->input_height(), op->input_width(), - {op->kernel_height(), op->kernel_width()}, - {op->stride_height(), op->stride_width()}, - {op->padding_height(), op->padding_width()}, - {op->dilation_height(), op->dilation_width()}, op->groups(), bias, - config)); - - tensorPool.insert_or_assign(op->out()->global_id(), out); - return; -} - -static void run(::tt::target::ttnn::MaxPool2dOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const ::ttnn::Tensor &input = tensorPool.at(op->in()->global_id()); - const ::ttnn::operations::pool::MaxPool2DOp operation = - ::ttnn::operations::pool::MaxPool2DOp(); - - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::Tensor out = operation.invoke( - 0, input, op->batch_size(), op->input_height(), op->input_width(), - op->channels(), {op->kernel_height(), op->kernel_width()}, - {op->stride_height(), op->stride_width()}, - {op->padding_height(), op->padding_width()}, - {op->dilation_height(), op->dilation_width()}, &device); - - tensorPool.insert_or_assign(op->out()->global_id(), out); - return; -} - -static void run(::tt::target::ttnn::DeallocOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - bool force = true; - ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); - tensor.deallocate(force); - tensorPool.erase(op->in()->global_id()); -} - -static void -run(::tt::target::ttnn::GetDeviceOp const *op, - const std::unordered_map &allDevices, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - const flatbuffers::Vector *chipIds = op->chip_ids(); - assert(chipIds->size() == 1 && "Expected 1 chip id"); - for (const uint32_t chipId : *chipIds) { - assert(allDevices.contains(chipId) && "Device not found"); - auto [iter, inserted] = - devicePool.try_emplace(chipId, allDevices.at(chipId)); - assert(inserted && "Duplicate device"); - } -} - -static void run(::tt::target::ttnn::FullOp const *op, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - ::ttnn::Device &device = getDevice(op->device(), devicePool); - ::ttnn::DataType outputDataType = getDataType(op->out()); - auto shape = ::ttnn::Shape(::tt::tt_metal::Shape( - utils::toShapeFromFBShape(*op->out()->desc()->shape()))); - float fillValue = op->fill_value(); - // TODO(bug #272), determine correct layout by tile shape in the future - ::ttnn::Layout outputLayout = ::ttnn::Layout::ROW_MAJOR; - std::optional> outputDevice = - std::make_optional(std::ref(device)); - std::optional<::tt::tt_metal::MemoryConfig> outputMemoryConfig = - std::make_optional(createMemoryConfig(op->out())); - - ::ttnn::Tensor out = - ::ttnn::full(shape, fillValue, outputDataType, outputLayout, outputDevice, - outputMemoryConfig); - - tensorPool.insert_or_assign(op->out()->global_id(), out); -} - -static void -run(::tt::target::ttnn::Operation const *op, - const std::unordered_map &allDevices, - std::unordered_map &devicePool, - ProgramTensorPool &tensorPool) { - +void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { switch 
(op->type_type()) { case ::tt::target::ttnn::OpType::GetDeviceOp: { - return run(op->type_as_GetDeviceOp(), allDevices, devicePool, tensorPool); - break; + return operations::context::run(op->type_as_GetDeviceOp(), context); } case ::tt::target::ttnn::OpType::ToMemoryConfigOp: { - return run(op->type_as_ToMemoryConfigOp(), devicePool, tensorPool); + return operations::layout::run(op->type_as_ToMemoryConfigOp(), context); } case ::tt::target::ttnn::OpType::ToLayoutOp: { - return run(op->type_as_ToLayoutOp(), devicePool, tensorPool); + return operations::layout::run(op->type_as_ToLayoutOp(), context); } case ::tt::target::ttnn::OpType::ToDeviceOp: { - return run(op->type_as_ToDeviceOp(), devicePool, tensorPool); + return operations::layout::run(op->type_as_ToDeviceOp(), context); } case ::tt::target::ttnn::OpType::EmptyOp: { - return run(op->type_as_EmptyOp(), devicePool, tensorPool); + return operations::creation::run(op->type_as_EmptyOp(), context); } case ::tt::target::ttnn::OpType::FullOp: { - return run(op->type_as_FullOp(), devicePool, tensorPool); - break; + return operations::creation::run(op->type_as_FullOp(), context); } case ::tt::target::ttnn::OpType::EltwiseOp: { - return run(op->type_as_EltwiseOp(), devicePool, tensorPool); + const ::tt::target::ttnn::EltwiseOp *eltwiseOp = op->type_as_EltwiseOp(); + if (operations::unary::isUnaryOp(eltwiseOp)) { + return operations::unary::run(eltwiseOp, context); + } + assert(operations::binary::isBinaryOp(eltwiseOp) && + "Eltwise op should be either unary or binary"); + return operations::binary::run(eltwiseOp, context); } case ::tt::target::ttnn::OpType::MatmulOp: { - return run(op->type_as_MatmulOp(), devicePool, tensorPool); + return operations::matmul::run(op->type_as_MatmulOp(), context); } case ::tt::target::ttnn::OpType::ReductionOp: { - return run(op->type_as_ReductionOp(), devicePool, tensorPool); + return operations::reduction::run(op->type_as_ReductionOp(), context); } case ::tt::target::ttnn::OpType::EmbeddingOp: { - return run(op->type_as_EmbeddingOp(), devicePool, tensorPool); + return operations::embedding::run(op->type_as_EmbeddingOp(), context); } case ::tt::target::ttnn::OpType::SoftmaxOp: { - return run(op->type_as_SoftmaxOp(), devicePool, tensorPool); + return operations::normalization::run(op->type_as_SoftmaxOp(), context); } case ::tt::target::ttnn::OpType::TransposeOp: { - return run(op->type_as_TransposeOp(), devicePool, tensorPool); - } - case ::tt::target::ttnn::OpType::Conv2dOp: { - return run(op->type_as_Conv2dOp(), devicePool, tensorPool); + return operations::data_movement::run(op->type_as_TransposeOp(), context); } case ::tt::target::ttnn::OpType::ConcatOp: { - return run(op->type_as_ConcatOp(), devicePool, tensorPool); + return operations::data_movement::run(op->type_as_ConcatOp(), context); } case ::tt::target::ttnn::OpType::ReshapeOp: { - return run(op->type_as_ReshapeOp(), devicePool, tensorPool); + return operations::data_movement::run(op->type_as_ReshapeOp(), context); + } + case ::tt::target::ttnn::OpType::Conv2dOp: { + return operations::conv::run(op->type_as_Conv2dOp(), context); } case ::tt::target::ttnn::OpType::DeallocOp: { - return run(op->type_as_DeallocOp(), devicePool, tensorPool); + return operations::deletion::run(op->type_as_DeallocOp(), context); } case ::tt::target::ttnn::OpType::MaxPool2dOp: { - return run(op->type_as_MaxPool2dOp(), devicePool, tensorPool); + return operations::pool::run(op->type_as_MaxPool2dOp(), context); } default: { throw std::runtime_error("Unsupported operation 
type"); @@ -997,9 +105,9 @@ run(::tt::target::ttnn::Operation const *op, } // Nop is single input, output tensor where input is returned as output. -bool handleNopProgram(::tt::target::ttnn::Program const *program, - std::vector<::ttnn::Tensor *> const &inputs, - std::vector<::ttnn::Tensor *> const &outputs) { +static bool handleNopProgram(::tt::target::ttnn::Program const *program, + std::vector<::ttnn::Tensor *> const &inputs, + std::vector<::ttnn::Tensor *> const &outputs) { bool isNop = program->inputs()->size() == 1 && program->outputs()->size() == 1 && @@ -1022,9 +130,8 @@ void runProgram(::ttnn::Device &device, if (handleNopProgram(program, inputs, outputs)) { return; } - std::unordered_map liveTensors; - std::unordered_map allDevices; - std::unordered_map devicePool; + TensorMap liveTensors; + DeviceMap allDevices; int inputIndex = 0; assert(program->inputs()->size() == inputs.size()); // Assuming single device for now until we support multichip @@ -1042,9 +149,8 @@ void runProgram(::ttnn::Device &device, liveTensors.try_emplace(output->global_id(), outputs[outputIndex++]); assert(inserted && "Duplicate output tensor"); } - ProgramTensorPool tensorPool(liveTensors); - for (::tt::target::ttnn::Operation const *op : *program->operations()) { - run(op, allDevices, devicePool, tensorPool); - } + ProgramExecutor executor(liveTensors, allDevices); + executor.execute(program); } + } // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index e1f786bc4..a535e5e2a 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt/runtime/runtime.h" #include "tt/runtime/detail/ttnn.h" +#include "tt/runtime/ttnn/utils.h" #include "tt/runtime/utils.h" #include "ttmlir/Target/TTNN/Target.h" #include "ttmlir/Version.h" -#include "utils.h" namespace tt::runtime::ttnn { diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index 98ed63833..1e191f3dc 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -58,7 +58,7 @@ linklibs = ["TTBinary", "TTRuntimeSysDesc"] if enable_ttnn: runlibs += ["_ttnn.so"] - linklibs += ["TTRuntimeTTNN", ":_ttnn.so"] + linklibs += ["TTRuntimeTTNN", "TTRuntimeTTNNOps", ":_ttnn.so"] if enable_ttmetal: runlibs += ["libtt_metal.so"] @@ -179,6 +179,7 @@ def package_files(directory): f"{src_dir}/build/runtime/lib", f"{src_dir}/build/runtime/lib/common", f"{src_dir}/build/runtime/lib/ttnn", + f"{src_dir}/build/runtime/lib/ttnn/operations", f"{src_dir}/build/runtime/lib/ttmetal", f"{toolchain}/lib", f"{src_dir}/build/runtime/tools/python/ttrt/runtime",