From bf1d922039a0991b558ff0b6215f6c5e2352551f Mon Sep 17 00:00:00 2001 From: Milan Topalovic <163355844+mtopalovicTT@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:46:12 +0200 Subject: [PATCH 01/16] Adding `unsqueeze` op (#544) Adding `unsqueeze` op --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 20 ++++++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 47 ++++++++++++++++++- lib/Dialect/TTIR/IR/TTIROps.cpp | 45 ++++++++++++++++++ .../ttmlir/Dialect/TTNN/simple_unsqueeze.mlir | 10 ++++ 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 5479dcb2f7..fe85f9bb76 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -428,6 +428,26 @@ def TTIR_SqueezeOp : TTIR_DPSOp<"squeeze"> { let hasVerifier = 1; } +def TTIR_UnsqueezeOp : TTIR_DPSOp<"unsqueeze"> { + let summary = "Unsqueeze op."; + let description = [{ + Unsqueeze tensor. + }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$output, + SI32Attr:$dim, + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + // ANCHOR: adding_an_op_matmul_ttir def TTIR_MatmulOp : TTIR_DPSOp<"matmul"> { let summary = "Matrix multiply operation."; diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 1818898408..e8cb8cd287 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -193,7 +193,7 @@ class SqueezeOpConversionPattern : public OpConversionPattern { LogicalResult matchAndRewrite(ttir::SqueezeOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - // Extract input tensor types + // Extract input tensor type ::mlir::RankedTensorType inputType = mlir::cast<::mlir::RankedTensorType>(adaptor.getInput().getType()); @@ -224,6 +224,50 @@ class SqueezeOpConversionPattern : public OpConversionPattern { } }; +class UnsqueezeOpConversionPattern + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::UnsqueezeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Extract input tensor type + ::mlir::RankedTensorType inputType = + mlir::cast<::mlir::RankedTensorType>(adaptor.getInput().getType()); + + // Get the unsqueeze dimension + int32_t dim = adaptor.getDim(); + + // Convert negative dim to its positive equivalent + if (dim < 0) { + dim += inputType.getRank() + 1; + } + + // Get the shape of the input tensor + auto inputShape = inputType.getShape(); + llvm::SmallVector newShape; + + // Insert the new dimension + for (int i = 0; i < inputType.getRank(); ++i) { + if (i == dim) { + newShape.push_back(1); + } + newShape.push_back(inputShape[i]); + } + + // Create the new shape attribute + auto shapeAttr = rewriter.getI32ArrayAttr(newShape); + + // Replace the UnsqueezeOp with a ReshapeOp + rewriter.replaceOpWithNewOp( + op, this->getTypeConverter()->convertType(op.getType()), + adaptor.getInput(), adaptor.getOutput(), shapeAttr); + + return success(); + } +}; + } // namespace // ANCHOR: adding_an_op_matmul_op_rewriter @@ -269,6 +313,7 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet 
&patterns, ConcatOpConversionPattern, ReshapeOpConversionPattern, SqueezeOpConversionPattern, + UnsqueezeOpConversionPattern, MatmulOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 721928aead..85255e497e 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -305,6 +305,51 @@ ::mlir::LogicalResult mlir::tt::ttir::SqueezeOp::verify() { return success(); } +::mlir::LogicalResult mlir::tt::ttir::UnsqueezeOp::verify() { + ::mlir::RankedTensorType inputType = getInput().getType(); + ::mlir::RankedTensorType outputType = getOutput().getType(); + int32_t dim = getDim(); + + // Convert negative dim to its positive equivalent + if (dim < 0) { + dim += inputType.getRank() + 1; + } + + // Check that the dim is within the bounds of the input tensor + if (dim > inputType.getRank() || dim < 0) { + return emitOpError( + "Dimension attribute must be within the bounds of the input tensor"); + } + + // Check that the output tensor has one more dimension than the input tensor + if (outputType.getRank() != inputType.getRank() + 1) { + return emitOpError( + "Output tensor must have one more dimension than the input tensor"); + } + + // and that the dimension added is of size 1 + if (outputType.getDimSize(dim) != 1) { + return emitOpError("Dimension added must be of size 1"); + } + + // All dimensions of the input tensor must be the same as the output tensor + // except for the dimension added + for (int64_t i = 0, j = 0; i < outputType.getRank(); ++i) { + if (i == dim) { + continue; + } + + if (inputType.getDimSize(j) != outputType.getDimSize(i)) { + return emitOpError("All dimensions of the input tensor must be the same " + "as the output tensor except for the dimension added"); + } + + j++; + } + + return success(); +} + // ANCHOR: adding_an_op_matmul_ttir_verify ::mlir::LogicalResult mlir::tt::ttir::MatmulOp::verify() { ::mlir::RankedTensorType inputAType = getA().getType(); diff --git a/test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir b/test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir new file mode 100644 index 0000000000..2400b6b5ed --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/simple_unsqueeze.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s| FileCheck %s +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<4x2x32x32xbf16>) -> tensor<4x1x2x32x32xbf16> { + %0 = tensor.empty() : tensor<4x1x2x32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] + %1 = "ttir.unsqueeze"(%arg0, %0) <{dim = -4 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<4x2x32x32xbf16>, tensor<4x1x2x32x32xbf16>) -> tensor<4x1x2x32x32xbf16> + return %1 : tensor<4x1x2x32x32xbf16> + } +} From 6595400dc22572c4e852b6e3bb61e265b84f4a0c Mon Sep 17 00:00:00 2001 From: Lewis Panos Date: Fri, 30 Aug 2024 10:54:07 -0400 Subject: [PATCH 02/16] Bringup E2E conv2d op (#484) --- include/ttmlir/Dialect/TTIR/IR/TTIROps.td | 31 ++++++++++ include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 34 +++++++++++ include/ttmlir/Target/TTNN/program.fbs | 22 +++++++ lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 68 +++++++++++++++++++++- lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 4 ++ lib/Dialect/TTIR/IR/TTIROps.cpp | 19 ++++++ lib/Dialect/TTIR/Transforms/Passes.cpp | 5 ++ lib/Dialect/TTNN/IR/TTNNOps.cpp | 24 ++++++++ lib/Target/TTNN/TTNNToFlatbuffer.cpp | 25 ++++++++ runtime/include/tt/runtime/detail/ttnn.h | 5 ++ 
runtime/lib/ttnn/program.cpp | 34 +++++++++++ runtime/lib/ttnn/runtime.cpp | 2 +- test/ttmlir/Dialect/TTNN/simple_conv.mlir | 10 ++++ 13 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 test/ttmlir/Dialect/TTNN/simple_conv.mlir diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index fe85f9bb76..edfee781a3 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -377,6 +377,37 @@ def TTIR_ConcatOp : TTIR_DPSOp<"concat"> { let arguments = (ins Variadic:$inputs, AnyRankedTensor:$output, SI32Attr:$dim, + + TT_OperandConstraintArrayAttr:$operand_constraints); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + +def TTIR_Conv2dOp : TTIR_DPSOp<"conv2d"> { + let summary = "Conv2d operation."; + let description = [{ + Applies a 2D convolution over an input image composed of several input planes. + }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$weight, + Optional:$bias, + AnyRankedTensor:$output, + SI32Attr:$stride_height, + SI32Attr:$stride_width, + SI32Attr:$dilation_height, + SI32Attr:$dilation_width, + SI32Attr:$groups, + SI32Attr:$padding_left, + SI32Attr:$padding_right, + SI32Attr:$padding_top, + SI32Attr:$padding_bottom, TT_OperandConstraintArrayAttr:$operand_constraints); let results = (outs AnyRankedTensor:$result); diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 08a6a38701..380d798bbf 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -304,6 +304,40 @@ def TTNN_MatmulOp : TTNN_NamedDPSOp<"matmul"> { } // ANCHOR_END: adding_an_op_matmul_ttnn +def TTNN_Conv2dOp : TTNN_NamedDPSOp<"conv2d"> { + let summary = "Conv2d operation."; + let description = [{ + Applies a 2D convolution over an input image composed of several input planes. 
+ }]; + + let arguments = (ins AnyRankedTensor:$input, + AnyRankedTensor:$weight, + Optional:$bias, + AnyRankedTensor:$output, + I32Attr:$in_channels, + I32Attr:$out_channels, + I32Attr:$batch_size, + I32Attr:$input_height, + I32Attr:$input_width, + I32Attr:$kernel_height, + I32Attr:$kernel_width, + I32Attr:$stride_height, + I32Attr:$stride_width, + I32Attr:$padding_height, + I32Attr:$padding_width, + I32Attr:$dilation_height, + I32Attr:$dilation_width, + I32Attr:$groups); + + let results = (outs AnyRankedTensor:$result); + + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + }]; + + let hasVerifier = 1; +} + def TTNN_EmptyOp : TTNN_Op<"empty"> { let summary = "Empty op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index b163e2a13c..954f509a74 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -100,6 +100,27 @@ table MatmulOp { } // ANCHOR_END: adding_an_op_matmul_fbs +table Conv2dOp { + input: tt.target.TensorRef; + weight: tt.target.TensorRef; + bias: tt.target.TensorRef; + out: tt.target.TensorRef; + in_channels: uint32; + out_channels: uint32; + batch_size: uint32; + input_height: uint32; + input_width: uint32; + kernel_height: uint32; + kernel_width: uint32; + stride_height: uint32; + stride_width: uint32; + padding_height: uint32; + padding_width: uint32; + dilation_height: uint32; + dilation_width: uint32; + groups: uint32; +} + union OpType { OpenDeviceOp, CloseDeviceOp, @@ -112,6 +133,7 @@ union OpType { EmbeddingOp, SoftmaxOp, TransposeOp, + Conv2dOp, ConcatOp, ReshapeOp } diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index e8cb8cd287..6f51eaa580 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -286,6 +286,71 @@ class MatmulOpConversionPattern : public OpConversionPattern { }; // ANCHOR_END: adding_an_op_matmul_op_rewriter +class Conv2dOpConversionPattern : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(ttir::Conv2dOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + auto kernel_ty = + mlir::cast(adaptor.getWeight().getType()); + llvm::ArrayRef kernel_shape = kernel_ty.getShape(); + + auto input_ty = mlir::cast(adaptor.getInput().getType()); + llvm::ArrayRef input_shape = input_ty.getShape(); + + auto output_ty = + mlir::cast(adaptor.getOutput().getType()); + llvm::ArrayRef output_shape = output_ty.getShape(); + + auto in_channels = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 1]); + auto out_channels = + rewriter.getI32IntegerAttr(output_shape[output_shape.size() - 1]); + auto batch_size = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 4]); + auto input_height = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 3]); + auto input_width = + rewriter.getI32IntegerAttr(input_shape[input_shape.size() - 2]); + + auto kernel_height = + rewriter.getI32IntegerAttr(kernel_shape[kernel_shape.size() - 2]); + auto kernel_width = + rewriter.getI32IntegerAttr(kernel_shape[kernel_shape.size() - 1]); + + auto stride_height = rewriter.getI32IntegerAttr(adaptor.getStrideHeight()); + auto stride_width = rewriter.getI32IntegerAttr(adaptor.getStrideWidth()); + + assert( + adaptor.getPaddingBottom() == adaptor.getPaddingTop() && + "TTNN only supports padding height/width 
attributes. Thus, padding_top " + "must equal padding_bottom for the op to execute as expected."); + assert(adaptor.getPaddingLeft() == adaptor.getPaddingRight() && + "TTNN only supports padding height/width attributes. Thus, " + "padding_left must equal padding_right for the op to execute as " + "expected."); + auto padding_height = rewriter.getI32IntegerAttr(adaptor.getPaddingTop()); + auto padding_width = rewriter.getI32IntegerAttr(adaptor.getPaddingRight()); + + auto dilation_height = + rewriter.getI32IntegerAttr(adaptor.getDilationHeight()); + auto dilation_width = + rewriter.getI32IntegerAttr(adaptor.getDilationWidth()); + auto groups = rewriter.getI32IntegerAttr(adaptor.getGroups()); + + rewriter.replaceOpWithNewOp( + op, this->getTypeConverter()->convertType(op.getType()), + adaptor.getInput(), adaptor.getWeight(), adaptor.getBias(), + adaptor.getOutput(), in_channels, out_channels, batch_size, input_width, + input_height, kernel_height, kernel_width, stride_height, stride_width, + padding_height, padding_width, dilation_height, dilation_width, groups); + return success(); + } +}; + namespace mlir::tt { void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, @@ -314,7 +379,8 @@ void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns, ReshapeOpConversionPattern, SqueezeOpConversionPattern, UnsqueezeOpConversionPattern, - MatmulOpConversionPattern + MatmulOpConversionPattern, + Conv2dOpConversionPattern >(typeConverter, ctx); // ANCHOR_END: op_rewriter_pattern_set // clang-format on diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index 88533cac48..1aee94c81b 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -203,6 +203,10 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, patterns.add>(typeConverter, ctx); patterns.add>(typeConverter, ctx); + // Conv ops + // + patterns.add>(typeConverter, ctx); + // Other ops // patterns.add>(typeConverter, ctx); diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 85255e497e..49d2a97429 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -387,6 +387,25 @@ ::mlir::LogicalResult mlir::tt::ttir::MatmulOp::verify() { } // ANCHOR_END: adding_an_op_matmul_ttir_verify +::mlir::LogicalResult mlir::tt::ttir::Conv2dOp::verify() { + ::mlir::RankedTensorType inputType = getInput().getType(); + ::mlir::RankedTensorType weightType = getWeight().getType(); + ::mlir::RankedTensorType biasType = + llvm::dyn_cast_or_null<::mlir::RankedTensorType>(getBias().getType()); + if (inputType.getRank() < 3) { + return emitOpError("Input must be at least a 3D tensor"); + } + if (weightType.getRank() != 4) { + return emitOpError("Weight must be a 4D tensor"); + } + if (biasType) { + if (biasType.getRank() != 4) { + return emitOpError("Bias must be a 4D tensor"); + } + } + return success(); +} + ::mlir::LogicalResult mlir::tt::ttir::AllocOp::verify() { auto layout = mlir::dyn_cast_or_null( getResult().getType().getEncoding()); diff --git a/lib/Dialect/TTIR/Transforms/Passes.cpp b/lib/Dialect/TTIR/Transforms/Passes.cpp index aca4598a5c..bf303f65e5 100644 --- a/lib/Dialect/TTIR/Transforms/Passes.cpp +++ b/lib/Dialect/TTIR/Transforms/Passes.cpp @@ -572,6 +572,11 @@ class TTIRLayoutDPSOperandsRewriter for (auto &operand : op->getOpOperands()) { bool isResult = op.isDpsInit(&operand); + // TTNN Conv2d moves input, weight, and bias from host to device + // 
itself. Inserting the ToLayoutOp on these operands is thus problematic.
+      if (mlir::isa<ttir::Conv2dOp>(op.getOperation()) && !isResult) {
+        continue;
+      }
       auto operandConstraint = mlir::cast(
           mlir::cast(op.getOperation())
diff --git a/lib/Dialect/TTNN/IR/TTNNOps.cpp b/lib/Dialect/TTNN/IR/TTNNOps.cpp
index 00dc77dce5..81d3c5c8b3 100644
--- a/lib/Dialect/TTNN/IR/TTNNOps.cpp
+++ b/lib/Dialect/TTNN/IR/TTNNOps.cpp
@@ -248,6 +248,30 @@ ::mlir::LogicalResult mlir::tt::ttnn::MatmulOp::verify() {
 }
 // ANCHOR_END: adding_an_op_matmul_ttnn_verify

+::mlir::LogicalResult mlir::tt::ttnn::Conv2dOp::verify() {
+  ::mlir::RankedTensorType inputType = getInput().getType();
+  ::mlir::RankedTensorType weightType = getWeight().getType();
+  ::mlir::RankedTensorType biasType =
+      llvm::dyn_cast_or_null<::mlir::RankedTensorType>(getBias().getType());
+
+  if (inputType.getRank() < 3) {
+    return emitOpError("Input must be at least a 3D tensor");
+  }
+  if (weightType.getRank() != 4) {
+    return emitOpError("Weight must be a 4D tensor");
+  }
+  if (biasType) {
+    if (biasType.getRank() != 4) {
+      return emitOpError("Bias must be a 4D tensor");
+    }
+    auto biasShape = biasType.getShape();
+    if (biasShape[0] != 1 || biasShape[1] != 1 || biasShape[2] != 1) {
+      return emitOpError("Bias must only have data on the final dimension");
+    }
+  }
+  return success();
+}
+
 ::mlir::LogicalResult AllocOp::verify() {
   auto layout = mlir::dyn_cast_or_null(
       getResult().getType().getEncoding());
diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp
index 07b9825742..2b0ce61233 100644
--- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp
+++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp
@@ -26,6 +26,7 @@
 #include "ttmlir/Target/Utils/FuncOpToProgram.h"
 #include "ttmlir/Target/Utils/MLIRToFlatbuffer.h"
 #include "ttmlir/Version.h"
+#include "types_generated.h"

 namespace mlir::tt::ttnn {

@@ -121,6 +122,27 @@ createOp(FlatbufferObjectCache &cache, MatmulOp op) {
 }
 // ANCHOR_END: adding_an_op_matmul_serialize_to_binary

+::flatbuffers::Offset<::tt::target::ttnn::Conv2dOp>
+createOp(FlatbufferObjectCache &cache, Conv2dOp op) {
+  auto in0 =
+      cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getInput()));
+  auto in1 = cache.at<::tt::target::TensorRef>(
+      getOperandThroughDPSOps(op.getWeight()));
+  auto in2 = op.getODSOperands(2).empty()
+                 ? 
flatbuffers::Offset<::tt::target::TensorRef>()
+                 : cache.at<::tt::target::TensorRef>(
+                       getOperandThroughDPSOps(op.getBias()));
+  auto output = cache.at<::tt::target::TensorRef>(
+      getOperandThroughDPSOps(op.getResult()));
+  return ::tt::target::ttnn::CreateConv2dOp(
+      *cache.fbb, in0, in1, in2, output, op.getInChannels(),
+      op.getOutChannels(), op.getBatchSize(), op.getInputHeight(),
+      op.getInputWidth(), op.getKernelHeight(), op.getKernelWidth(),
+      op.getStrideHeight(), op.getStrideWidth(), op.getPaddingHeight(),
+      op.getPaddingWidth(), op.getDilationHeight(), op.getDilationWidth(),
+      op.getGroups());
+}
+
 template <typename EltwiseOp>
 ::flatbuffers::Offset<::tt::target::ttnn::EltwiseOp>
 createEltwiseOp(FlatbufferObjectCache &cache, EltwiseOp op) {
@@ -324,6 +346,9 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op,
     return createOperation(cache, createTransposeOp(cache, transposeOp),
                            debugString);
   }
+  if (auto conv2dOp = dyn_cast<Conv2dOp>(op); conv2dOp) {
+    return createOperation(cache, createOp(cache, conv2dOp), debugString);
+  }
   if (auto concatOp = dyn_cast<ConcatOp>(op); concatOp) {
     return createOperation(cache, createConcatOp(cache, concatOp), debugString);
   }
diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h
index 89405df43d..487bfdc779 100644
--- a/runtime/include/tt/runtime/detail/ttnn.h
+++ b/runtime/include/tt/runtime/detail/ttnn.h
@@ -39,8 +39,10 @@
 #pragma clang diagnostic ignored "-Wunused-but-set-variable"
 #pragma clang diagnostic ignored "-Wlogical-op-parentheses"
 #pragma clang diagnostic ignored "-Wundefined-inline"
+
 #define FMT_HEADER_ONLY
 #include "ttnn/device.hpp"
+#include "ttnn/operations/conv/conv2d/conv2d.hpp"
 #include "ttnn/operations/copy.hpp"
 #include "ttnn/operations/core/core.hpp"
 #include "ttnn/operations/creation.hpp"
@@ -59,6 +61,9 @@
 namespace tt::runtime::ttnn {

+// Default L1 small size to use for the ttnn runtime (32kb).
+constexpr std::size_t kL1SmallSize = 1 << 15;
+
 std::pair getCurrentSystemDesc();

 Tensor createTensor(std::shared_ptr<void> data,
diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp
index dd5f0acc78..d43eadba16 100644
--- a/runtime/lib/ttnn/program.cpp
+++ b/runtime/lib/ttnn/program.cpp
@@ -6,11 +6,14 @@
 #include
 #include
 #include
+#include
 #include

 #include "tt/runtime/detail/ttnn.h"
 #include "tt/runtime/runtime.h"
 #include "ttmlir/Target/TTNN/program_generated.h"
+#include "ttnn/device.hpp"
+#include "ttnn/operations/conv/conv2d/conv2d.hpp"
 #include "ttnn/tensor/types.hpp"
 #include "ttnn/types.hpp"
 #include "types_generated.h"
@@ -475,6 +478,34 @@ run(::tt::target::ttnn::MatmulOp const *op, ::ttnn::Device &device,
 }
 // ANCHOR_END: adding_an_op_matmul_runtime

+static void
+run(::tt::target::ttnn::Conv2dOp const *op, ::ttnn::Device &device,
+    std::unordered_map<uint32_t, ::ttnn::Tensor *> &liveTensors,
+    std::list<::ttnn::Tensor> &tensorPool) {
+  auto &input = *liveTensors.at(op->input()->global_id());
+  auto &weight = *liveTensors.at(op->weight()->global_id());
+  std::optional<::ttnn::Tensor> bias =
+      op->bias() ? 
std::make_optional(*liveTensors.at(op->bias()->global_id())) + : std::nullopt; + auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); + config.dtype = input.dtype(); + config.weights_dtype = weight.dtype(); + + ::ttnn::Tensor out = + std::get<0>(::ttnn::operations::conv::conv2d::conv2d<::ttnn::Device>( + input, weight, &device, op->in_channels(), op->out_channels(), + op->batch_size(), op->input_height(), op->input_width(), + {op->kernel_height(), op->kernel_width()}, + {op->stride_height(), op->stride_width()}, + {op->padding_height(), op->padding_width()}, + {op->dilation_height(), op->dilation_width()}, op->groups(), bias, + config)); + + tensorPool.push_back(out); + liveTensors.insert_or_assign(op->out()->global_id(), &tensorPool.back()); + return; +} + static void run(::tt::target::ttnn::Operation const *op, ::ttnn::Device &device, std::unordered_map &liveTensors, @@ -516,6 +547,9 @@ run(::tt::target::ttnn::Operation const *op, ::ttnn::Device &device, case ::tt::target::ttnn::OpType::TransposeOp: { return run(op->type_as_TransposeOp(), device, liveTensors, tensorPool); } + case ::tt::target::ttnn::OpType::Conv2dOp: { + return run(op->type_as_Conv2dOp(), device, liveTensors, tensorPool); + } case ::tt::target::ttnn::OpType::ConcatOp: { return run(op->type_as_ConcatOp(), device, liveTensors, tensorPool); case ::tt::target::ttnn::OpType::ReshapeOp: { diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 37a69c154b..9cf0575138 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -60,7 +60,7 @@ Device openDevice(std::vector const &deviceIds, std::vector const &numHWCQs) { assert(deviceIds.size() == 1 && "Only one device is supported for now"); assert(numHWCQs.empty() && "HWCQs are not supported for now"); - auto &device = ::ttnn::open_device(deviceIds.front()); + auto &device = ::ttnn::open_device(deviceIds.front(), kL1SmallSize); return Device::borrow(device, DeviceRuntime::TTNN); } diff --git a/test/ttmlir/Dialect/TTNN/simple_conv.mlir b/test/ttmlir/Dialect/TTNN/simple_conv.mlir new file mode 100644 index 0000000000..5a016c5966 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/simple_conv.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32x32x64xbf16>, %arg1: tensor<64x64x3x3xbf16>, %arg2: tensor<1x1x1x64xbf16>) -> tensor<1x32x32x64xbf16> { + %0 = tensor.empty() : tensor<1x32x32x64xbf16> + // CHECK: %[[C:.*]] = "ttnn.conv2d"[[C:.*]] + %1 = "ttir.conv2d"(%arg0, %arg1, %arg2, %0) <{stride_height=1: si32, stride_width=1: si32, dilation_height=1: si32, dilation_width=1: si32, groups=1: si32, padding_left=1: si32, padding_right=1: si32, padding_top=1: si32, padding_bottom=1: si32, is_convtranspose2d=0: si32, output_height_transpose=0: si32, output_width_transpose=0: si32, stride_transpose=0: si32, operand_constraints = [#any_device, #any_device, #any_device, #any_device]}> : (tensor<1x32x32x64xbf16>, tensor<64x64x3x3xbf16>, tensor<1x1x1x64xbf16>, tensor<1x32x32x64xbf16>) -> tensor<1x32x32x64xbf16> + return %1 : tensor<1x32x32x64xbf16> + } +} From e795e09c45c4e26c22eeb2ce0b8380d5607093a0 Mon Sep 17 00:00:00 2001 From: Milan Topalovic <163355844+mtopalovicTT@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:25:05 +0200 Subject: [PATCH 03/16] Adding negative dim support for `squeeze` (#561) Adding negative dim support for `squeeze` --- lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp | 4 ++++ 
lib/Dialect/TTIR/IR/TTIROps.cpp | 4 ++++ test/ttmlir/Dialect/TTNN/simple_squeeze.mlir | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp index 6f51eaa580..f5c3c460c5 100644 --- a/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp +++ b/lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp @@ -200,6 +200,10 @@ class SqueezeOpConversionPattern : public OpConversionPattern { // Get the squeeze dimension int32_t dim = adaptor.getDim(); + if (dim < 0) { + dim += inputType.getRank(); + } + // Get the shape of the input tensor auto inputShape = inputType.getShape(); llvm::SmallVector newShape; diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 49d2a97429..e26d50e8d8 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -267,6 +267,10 @@ ::mlir::LogicalResult mlir::tt::ttir::SqueezeOp::verify() { ::mlir::RankedTensorType outputType = getOutput().getType(); int32_t dim = getDim(); + if (dim < 0) { + dim += inputType.getRank(); + } + // Check that the dimension `dim` is valid. if (dim < 0 || dim >= inputType.getRank()) { return emitOpError() << "Invalid dimension " << dim << " for squeezing."; diff --git a/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir b/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir index 1798605d8b..34367c4736 100644 --- a/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_squeeze.mlir @@ -4,7 +4,7 @@ module attributes {} { func.func @forward(%arg0: tensor<1x2x1x32x32xbf16>) -> tensor<1x2x32x32xbf16> { %0 = tensor.empty() : tensor<1x2x32x32xbf16> // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] - %1 = "ttir.squeeze"(%arg0, %0) <{dim = 2 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x2x1x32x32xbf16>, tensor<1x2x32x32xbf16>) -> tensor<1x2x32x32xbf16> + %1 = "ttir.squeeze"(%arg0, %0) <{dim = -3 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x2x1x32x32xbf16>, tensor<1x2x32x32xbf16>) -> tensor<1x2x32x32xbf16> return %1 : tensor<1x2x32x32xbf16> } } From 8cc0f058037b7db39f0cf51c62d0c1311950b85f Mon Sep 17 00:00:00 2001 From: Tapasvi Patel <133996364+tapspatel@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:27:09 -0500 Subject: [PATCH 04/16] #445: Added load system desc path option in ttir to ttnn backend pipeline. 
Migrated more tests from ttir into ttnn silicon (#556) --- docs/src/ttrt.md | 2 + .../ttmlir/Dialect/TTNN/Pipelines/Passes.h | 7 ++ lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp | 6 +- .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 2 +- .../TTNN/eltwise/unary/simple_reciprocal.mlir | 15 ---- .../TTNN/eltwise/unary/simple_sqrt.mlir | 15 ---- .../TTNN/embedding/embedding_1d_tensor.mlir | 16 ++++ .../TTNN/embedding/embedding_non_tile.mlir | 16 ++++ .../TTNN/embedding/simple_embedding.mlir | 16 ++++ .../Silicon/TTNN/operand_broadcasts.mlir | 26 +++++++ test/ttmlir/Silicon/TTNN/simple_div.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 74 ++++++++++++++++++- test/ttmlir/Silicon/TTNN/simple_ge.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_matmul.mlir | 4 +- test/ttmlir/Silicon/TTNN/simple_mean.mlir | 16 ++++ test/ttmlir/Silicon/TTNN/simple_multiply.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_nop.mlir | 1 - test/ttmlir/Silicon/TTNN/simple_relu.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_subtract.mlir | 16 ---- test/ttmlir/Silicon/TTNN/simple_sum.mlir | 4 +- test/ttmlir/Silicon/TTNN/transpose.mlir | 33 +++++++++ 21 files changed, 213 insertions(+), 120 deletions(-) delete mode 100644 test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir create mode 100644 test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir create mode 100644 test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir create mode 100644 test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir create mode 100644 test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_div.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_ge.mlir create mode 100644 test/ttmlir/Silicon/TTNN/simple_mean.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_multiply.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_relu.mlir delete mode 100644 test/ttmlir/Silicon/TTNN/simple_subtract.mlir create mode 100644 test/ttmlir/Silicon/TTNN/transpose.mlir diff --git a/docs/src/ttrt.md b/docs/src/ttrt.md index 26be66bcff..297f505bd4 100644 --- a/docs/src/ttrt.md +++ b/docs/src/ttrt.md @@ -63,6 +63,8 @@ ttrt query --save-artifacts 4. Use ttmlir-opt tool in compiler to feed system descriptor. See the [ttmlir-opt](./ttmlir-opt.md) documentation for more information on how to generate .mlir files. ```bash ./build/bin/ttmlir-opt --ttir-load-system-desc="path=/path/to/system_desc.ttsys" --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_subtract.mlir -o ttnn.mlir +or (pip path directly into ttir-to-ttnn-backend-pipeline) +./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=/path/to/system_desc.ttsys" test/ttmlir/Dialect/TTNN/simple_subtract.mlir -o ttnn.mlir ``` 5. Use ttmlir-translate tool in compiler to generate the flatbuffer executable. See the [ttmlir-translate](./ttmlir-translate.md) documentation for more information on how to generate flatbuffer files. 
```bash diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h b/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h index 141745a1dc..c44dc99e94 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/Passes.h @@ -87,6 +87,13 @@ struct TTIRToTTNNBackendPipelineOptions *this, "override-grid-sizes", llvm::cl::desc("Override grid sizes for specific ops."), llvm::cl::init(llvm::StringMap>())}; + + // Option to provide a system descriptor flatbuffer file to compile against + Option systemDescPath{ + *this, "system-desc-path", + llvm::cl::desc( + "Pass in a system descriptor flatbuffer to compile against."), + llvm::cl::init("")}; }; void createTTIRToTTNNBackendPipeline( diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp index 9a84025e6d..cb78dfd682 100644 --- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp +++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp @@ -15,7 +15,11 @@ namespace mlir::tt::ttnn { void createTTIRToTTNNBackendPipeline( OpPassManager &pm, const TTIRToTTNNBackendPipelineOptions &options) { - pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc()); + + ttir::TTIRLoadSystemDescOptions systemDescOptions; + systemDescOptions.path = options.systemDescPath; + pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc(systemDescOptions)); + pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice()); mlir::tt::ttir::TTIRLayoutOptions layoutOptions; layoutOptions.initMemorySpace = mlir::tt::MemorySpace::System; diff --git a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir index 560deec7d3..1664ceba3e 100644 --- a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir +++ b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir @@ -1,4 +1,4 @@ -// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s // UNSUPPORTED: true #l1_ = #tt.memory_space diff --git a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir b/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir deleted file mode 100644 index 2239c3b0cc..0000000000 --- a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_reciprocal.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline --ttir-load-system-desc="path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.reciprocal"[[C:.*]] - %1 = "ttir.reciprocal"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir b/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir deleted file mode 100644 index 7a7111d338..0000000000 --- a/test/ttmlir/Silicon/TTNN/eltwise/unary/simple_sqrt.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline 
--ttir-load-system-desc="path=%system_desc_path%" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.sqrt"[[C:.*]] - %1 = "ttir.sqrt"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir b/test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir new file mode 100644 index 0000000000..8c16055cf6 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/embedding/embedding_1d_tensor.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<32xf32>, %arg1: tensor<512x128xf32>) -> tensor<32x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<32x128xf32> + // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] + %1 = "ttir.embedding"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32xf32>, tensor<512x128xf32>, tensor<32x128xf32>) -> tensor<32x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<32x128xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir b/test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir new file mode 100644 index 0000000000..c538cf9e41 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/embedding/embedding_non_tile.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<1x32xf32>, %arg1: tensor<512x128xf32>) -> tensor<1x32x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<1x32x128xf32> + // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] + %1 = "ttir.embedding"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32xf32>, tensor<512x128xf32>, tensor<1x32x128xf32>) -> tensor<1x32x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<1x32x128xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir b/test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir new file mode 100644 index 0000000000..4daa472fb4 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/embedding/simple_embedding.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: 
ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<32x32xf32>, %arg1: tensor<512x128xf32>) -> tensor<32x32x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<32x32x128xf32> + // CHECK: %[[C:.*]] = "ttnn.embedding"[[C:.*]] + %1 = "ttir.embedding"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xf32>, tensor<512x128xf32>, tensor<32x32x128xf32>) -> tensor<32x32x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<32x32x128xf32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir b/test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir new file mode 100644 index 0000000000..1bcea0d1c3 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/operand_broadcasts.mlir @@ -0,0 +1,26 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +module attributes {} { + func.func @bcast_one_dim(%arg0: tensor<2x64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<2x64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<2x64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] + %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2x64x128xf32>, tensor<64x128xf32>, tensor<2x64x128xf32>) -> tensor<2x64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<2x64x128xf32> + } + + func.func @bcast_multi_dim(%arg0: tensor<17x16x15x14xf32>, %arg1: tensor<15x1xf32>) -> tensor<17x16x15x14xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<17x16x15x14xf32> + // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] + %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<17x16x15x14xf32>, tensor<15x1xf32>, tensor<17x16x15x14xf32>) -> tensor<17x16x15x14xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<17x16x15x14xf32> + } + +} diff --git a/test/ttmlir/Silicon/TTNN/simple_div.mlir b/test/ttmlir/Silicon/TTNN/simple_div.mlir deleted file mode 100644 index f5c7ee878d..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_div.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.div"[[C:.*]] - %1 = "ttir.div"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> 
tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index ed3829935c..6afbe4d8e1 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -1,8 +1,8 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - #any_device = #tt.operand_constraint +#any_device_tile = #tt.operand_constraint func.func @subtract(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] @@ -53,3 +53,73 @@ func.func @ge(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64 // CHECK: "ttnn.close_device"[[C:.*]] return %1 : tensor<64x128xf32> } + +func.func @concat(%arg0: tensor<32x32xf32>, %arg1: tensor<32x64xf32>) -> tensor<32x96xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<32x96xf32> + // CHECK: %[[C:.*]] = "ttnn.concat"[[C:.*]] + %1 = "ttir.concat"(%arg0, %arg1, %0) <{dim = 1 : si32, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xf32>, tensor<32x64xf32>, tensor<32x96xf32>) -> tensor<32x96xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<32x96xf32> +} + +func.func @reshape(%arg0: tensor<4x2x32x32xbf16>) -> tensor<2x4x32x32xbf16> { + %0 = tensor.empty() : tensor<2x4x32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] + %1 = "ttir.reshape"(%arg0, %0) <{shape = [2: i32, 4: i32, 32: i32, 32: i32] , operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<4x2x32x32xbf16>, tensor<2x4x32x32xbf16>) -> tensor<2x4x32x32xbf16> + return %1 : tensor<2x4x32x32xbf16> +} + +func.func @squeeze(%arg0: tensor<1x2x1x32x32xbf16>) -> tensor<1x2x32x32xbf16> { + %0 = tensor.empty() : tensor<1x2x32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.reshape"[[C:.*]] + %1 = "ttir.squeeze"(%arg0, %0) <{dim = 2 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<1x2x1x32x32xbf16>, tensor<1x2x32x32xbf16>) -> tensor<1x2x32x32xbf16> + return %1 : tensor<1x2x32x32xbf16> +} + +func.func @reciprocal(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.reciprocal"[[C:.*]] + %1 = "ttir.reciprocal"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<64x128xf32> +} + +func.func @sigmoid(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.sigmoid"[[C:.*]] + %1 = "ttir.sigmoid"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : 
tensor<64x128xf32> +} + +func.func @sqrt(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<64x128xf32> + // CHECK: %[[C:.*]] = "ttnn.sqrt"[[C:.*]] + %1 = "ttir.sqrt"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<64x128xf32> +} + +func.func @softmax(%arg0: tensor<512x1024xbf16>) -> tensor<512x1024xbf16> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<512x1024xbf16> + // CHECK: %[[C:.*]] = "ttnn.softmax"[[C:.*]] + // Check for positive dimension attribute + %1 = "ttir.softmax"(%arg0, %0) <{dimension = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<512x1024xbf16>, tensor<512x1024xbf16>) -> tensor<512x1024xbf16> + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %2 = tensor.empty() : tensor<512x1024xbf16> + // CHECK: %[[C:.*]] = "ttnn.softmax"[[C:.*]] + // Check for negative dimension attribute + %3 = "ttir.softmax"(%1, %2) <{dimension = -1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<512x1024xbf16>, tensor<512x1024xbf16>) -> tensor<512x1024xbf16> + // CHECK: "ttnn.close_device"[[C:.*]] + return %3 : tensor<512x1024xbf16> +} diff --git a/test/ttmlir/Silicon/TTNN/simple_ge.mlir b/test/ttmlir/Silicon/TTNN/simple_ge.mlir deleted file mode 100644 index c2efad81c2..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_ge.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.ge"[[C:.*]] - %1 = "ttir.ge"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir index c97518ce7d..a90e7817b5 100644 --- a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir @@ -1,8 +1,8 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - #any_device_tile = #tt.operand_constraint +// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>> module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> 
tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/ttmlir/Silicon/TTNN/simple_mean.mlir b/test/ttmlir/Silicon/TTNN/simple_mean.mlir new file mode 100644 index 0000000000..c3705a6ff0 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/simple_mean.mlir @@ -0,0 +1,16 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module { + func.func @forward(%arg0: tensor<512x1024xbf16>) -> tensor<512x32xbf16> { + // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] + // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] + %0 = tensor.empty() : tensor<512x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.mean"[[C:.*]] + %1 = "ttir.mean"(%arg0, %0) <{dim_arg = [-1: i32], keep_dim = true, operand_constraints = [#any_device, #any_device]}> : (tensor<512x1024xbf16>, tensor<512x32xbf16>) -> tensor<512x32xbf16> + // CHECK: "ttnn.close_device"[[C:.*]] + return %1 : tensor<512x32xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/simple_multiply.mlir b/test/ttmlir/Silicon/TTNN/simple_multiply.mlir deleted file mode 100644 index 91792a2ceb..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_multiply.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.multiply"[[C:.*]] - %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_nop.mlir b/test/ttmlir/Silicon/TTNN/simple_nop.mlir index 0bce6b0a15..7cf9b1bd20 100644 --- a/test/ttmlir/Silicon/TTNN/simple_nop.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_nop.mlir @@ -1,7 +1,6 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - module @jit_convert_element_type attributes {mhlo.num_partitions = 1 : i32, mhlo.num_replicas = 1 : i32} { func.func public @main(%arg0: tensor<2x2xf32> {mhlo.layout_mode = "default"}) -> (tensor<2x2xf32> {jax.result_info = "", mhlo.layout_mode = "default"}) { // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] diff --git a/test/ttmlir/Silicon/TTNN/simple_relu.mlir b/test/ttmlir/Silicon/TTNN/simple_relu.mlir deleted file mode 100644 index c53100894b..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_relu.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate 
--ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.relu"[[C:.*]] - %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_subtract.mlir b/test/ttmlir/Silicon/TTNN/simple_subtract.mlir deleted file mode 100644 index b15f3d2775..0000000000 --- a/test/ttmlir/Silicon/TTNN/simple_subtract.mlir +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn - -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] - // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] - %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: %[[C:.*]] = "ttnn.subtract"[[C:.*]] - %1 = "ttir.subtract"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> - // CHECK: "ttnn.close_device"[[C:.*]] - return %1 : tensor<64x128xf32> - } -} diff --git a/test/ttmlir/Silicon/TTNN/simple_sum.mlir b/test/ttmlir/Silicon/TTNN/simple_sum.mlir index a976499643..9af10c8a8b 100644 --- a/test/ttmlir/Silicon/TTNN/simple_sum.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_sum.mlir @@ -1,9 +1,7 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s > %t.mlir +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -// https://github.com/tenstorrent/tt-mlir/issues/528 // UNSUPPORTED: true - #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<512x1024xbf16>) -> tensor<512x32xbf16> { diff --git a/test/ttmlir/Silicon/TTNN/transpose.mlir b/test/ttmlir/Silicon/TTNN/transpose.mlir new file mode 100644 index 0000000000..184b6b8076 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/transpose.mlir @@ -0,0 +1,33 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +#any_device = #tt.operand_constraint +#any_device_tile = #tt.operand_constraint + +func.func @transpose(%arg0: tensor<64x128xbf16>) -> tensor<128x64xbf16> { + %0 = tensor.empty() : tensor<128x64xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 0 : si32, dim1 = 1 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x64xbf16>) -> tensor<128x64xbf16> + return %1 : tensor<128x64xbf16> +} + +func.func 
@transpose_8x8(%arg0: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = tensor.empty() : tensor<32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 0 : si32, dim1 = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> +} + +func.func @transpose_8x16_reverse_dims(%arg0: tensor<64x16xbf16>) -> tensor<16x64xbf16> { + %0 = tensor.empty() : tensor<16x64xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 1 : si32, dim1 = 0 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<64x16xbf16>, tensor<16x64xbf16>) -> tensor<16x64xbf16> + return %1 : tensor<16x64xbf16> +} + +func.func @transpose_negative_dims(%arg0: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = tensor.empty() : tensor<32x32xbf16> + // CHECK: %[[C:.*]] = "ttnn.transpose"[[C:.*]] + %1 = "ttir.transpose"(%arg0, %0) <{dim0 = -1 : si32, dim1 = -2 : si32, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> +} From 19c54072584ae3ba0c5be20c536b3a82c1e42cab Mon Sep 17 00:00:00 2001 From: Tapasvi Patel <133996364+tapspatel@users.noreply.github.com> Date: Fri, 30 Aug 2024 16:13:46 -0500 Subject: [PATCH 05/16] #563: Enable mixed ttnn and ttm runtime in ttrt (#565) --- runtime/tools/python/CMakeLists.txt | 2 +- runtime/tools/python/ttrt/common/api.py | 173 ++++++++++++------------ 2 files changed, 89 insertions(+), 86 deletions(-) diff --git a/runtime/tools/python/CMakeLists.txt b/runtime/tools/python/CMakeLists.txt index 791541cbb1..84810a4cf3 100644 --- a/runtime/tools/python/CMakeLists.txt +++ b/runtime/tools/python/CMakeLists.txt @@ -4,7 +4,7 @@ add_custom_target(ttrt-copy-files ) add_custom_target(ttrt - COMMAND rm -f *.whl + COMMAND rm -f build/*.whl COMMAND TTMLIR_ENABLE_RUNTIME=${TTMLIR_ENABLE_RUNTIME} TT_RUNTIME_ENABLE_TTNN=${TT_RUNTIME_ENABLE_TTNN} TT_RUNTIME_ENABLE_TTMETAL=${TT_RUNTIME_ENABLE_TTMETAL} diff --git a/runtime/tools/python/ttrt/common/api.py b/runtime/tools/python/ttrt/common/api.py index f2c63a80ac..4b2d3e4185 100644 --- a/runtime/tools/python/ttrt/common/api.py +++ b/runtime/tools/python/ttrt/common/api.py @@ -793,112 +793,115 @@ def _execute(binaries): self.logging.debug(f"setting torch manual seed={self['seed']}") torch.manual_seed(self["seed"]) ttrt.runtime.set_compatible_runtime(binaries[0].fbb) - self.logging.debug(f"opening device id={self.query.device_ids[0]}") device = ttrt.runtime.open_device([self.query.device_ids[0]]) - atexit.register(lambda: ttrt.runtime.close_device(device)) - - for bin in binaries: - self.logging.info(f"evaluating binary={bin.file_path}") - - program_indices = [] - if self["program_index"] == "all": - program_indices.extend(range(bin.get_num_programs())) - else: - program_indices.append(int(self["program_index"])) - for program_index in program_indices: - self.logging.debug( - f"evaluating program={program_index} for binary={bin.file_path}" - ) + try: + for bin in binaries: + self.logging.info(f"evaluating binary={bin.file_path}") - program = bin.get_program(program_index) - program.populate_inputs( - API.Run.TorchInitilizer.get_initilizer(self["init"]) - ) - program.populate_outputs( - API.Run.TorchInitilizer.get_initilizer("zeros") - ) + program_indices = [] + if self["program_index"] == "all": + program_indices.extend(range(bin.get_num_programs())) + 
else: + program_indices.append(int(self["program_index"])) - total_inputs = [] - total_outputs = [] - for loop in range(self["loops"]): + for program_index in program_indices: self.logging.debug( - f"generating inputs/outputs for loop={loop+1}/{self['loops']} for binary={bin.file_path}" + f"evaluating program={program_index} for binary={bin.file_path}" ) - inputs = [] - outputs = [] - for i in program.input_tensors: - inputs.append( - ttrt.runtime.create_tensor( - i.data_ptr(), - list(i.shape), - list(i.stride()), - i.element_size(), - Binary.Program.to_data_type(i.dtype), - ) + program = bin.get_program(program_index) + program.populate_inputs( + API.Run.TorchInitilizer.get_initilizer(self["init"]) + ) + program.populate_outputs( + API.Run.TorchInitilizer.get_initilizer("zeros") + ) + + total_inputs = [] + total_outputs = [] + for loop in range(self["loops"]): + self.logging.debug( + f"generating inputs/outputs for loop={loop+1}/{self['loops']} for binary={bin.file_path}" ) - for i in program.output_tensors: - outputs.append( - ttrt.runtime.create_tensor( - i.data_ptr(), - list(i.shape), - list(i.stride()), - i.element_size(), - Binary.Program.to_data_type(i.dtype), + inputs = [] + outputs = [] + for i in program.input_tensors: + inputs.append( + ttrt.runtime.create_tensor( + i.data_ptr(), + list(i.shape), + list(i.stride()), + i.element_size(), + Binary.Program.to_data_type(i.dtype), + ) ) - ) - total_inputs.append(inputs) - total_outputs.append(outputs) + for i in program.output_tensors: + outputs.append( + ttrt.runtime.create_tensor( + i.data_ptr(), + list(i.shape), + list(i.stride()), + i.element_size(), + Binary.Program.to_data_type(i.dtype), + ) + ) - event = None - for loop in range(self["loops"]): - self.logging.debug( - f"starting loop={loop+1}/{self['loops']} for binary={bin.file_path}" - ) + total_inputs.append(inputs) + total_outputs.append(outputs) - event = ttrt.runtime.submit( - device, - bin.fbb, - program_index, - total_inputs[loop], - total_outputs[loop], - ) + event = None + for loop in range(self["loops"]): + self.logging.debug( + f"starting loop={loop+1}/{self['loops']} for binary={bin.file_path}" + ) - self.logging.debug( - f"finished loop={loop+1}/{self['loops']} for binary={bin.file_path}" - ) + event = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + total_outputs[loop], + ) - ttrt.runtime.wait(event) + self.logging.debug( + f"finished loop={loop+1}/{self['loops']} for binary={bin.file_path}" + ) - if self["identity"]: - self.logging.debug( - f"checking identity with rtol={self['rtol']} and atol={self['atol']}" - ) + ttrt.runtime.wait(event) - for i, o in zip( - program.input_tensors, program.output_tensors - ): - if not torch.allclose( - i, o, rtol=self["rtol"], atol=self["atol"] + if self["identity"]: + self.logging.debug( + f"checking identity with rtol={self['rtol']} and atol={self['atol']}" + ) + + for i, o in zip( + program.input_tensors, program.output_tensors ): - self.logging.error( - f"Failed: inputs and outputs do not match in binary" - ) - self.logging.error(i - o) + if not torch.allclose( + i, o, rtol=self["rtol"], atol=self["atol"] + ): + self.logging.error( + f"Failed: inputs and outputs do not match in binary" + ) + self.logging.error(i - o) - self.logging.debug(f"input tensors for program={program_index}") - for tensor in program.input_tensors: - self.logging.debug(f"{tensor}\n") + self.logging.debug( + f"input tensors for program={program_index}" + ) + for tensor in program.input_tensors: + 
self.logging.debug(f"{tensor}\n") - self.logging.debug( - f"output tensors for program={program_index}" - ) - for tensor in program.output_tensors: - self.logging.debug(f"{tensor}\n") + self.logging.debug( + f"output tensors for program={program_index}" + ) + for tensor in program.output_tensors: + self.logging.debug(f"{tensor}\n") + finally: + ttrt.runtime.close_device(device) self.logging.debug(f"executing ttnn binaries") _execute(self.ttnn_binaries) From 0b83c39218dbfd8bfa5242b24a3bfda352188ddb Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Sat, 31 Aug 2024 07:36:58 -0700 Subject: [PATCH 06/16] Add a new TTIR Layout pass option defaultMemorySpace (#564) --- .../ttmlir/Dialect/TTIR/Transforms/Passes.td | 4 ++ lib/Dialect/TTIR/Transforms/Passes.cpp | 47 +++++++++++++++---- lib/Dialect/TTMetal/Transforms/Passes.cpp | 1 + lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp | 1 + test/ttmlir/Dialect/TTIR/test_grid_set.mlir | 2 +- .../Dialect/TTNN/multiple_add_with_loc.mlir | 2 +- .../multiple_add_with_loc_grid_override.mlir | 4 +- test/ttmlir/Dialect/TTNN/simple_matmul.mlir | 2 +- .../Dialect/TTNN/ttir_to_ttnn_pipeline.mlir | 2 +- .../ttir_to_ttnn_pipeline_custom_opt.mlir | 2 +- 10 files changed, 50 insertions(+), 17 deletions(-) diff --git a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td index de7ac591af..c5a67e76c5 100644 --- a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td @@ -47,6 +47,10 @@ def TTIRLayout: Pass<"ttir-layout", "::mlir::ModuleOp"> { "::mlir::tt::MemorySpace", /*default=*/"::mlir::tt::MemorySpace::System", "Set the initial memory space for tensors to start in">, + Option<"defaultMemorySpace", "default-memory-space", + "::mlir::tt::MemorySpace", + /*default=*/"::mlir::tt::MemorySpace::DeviceDRAM", + "Set the default memory space for layout pass to prefer for operation operands, if not constrained">, ]; } diff --git a/lib/Dialect/TTIR/Transforms/Passes.cpp b/lib/Dialect/TTIR/Transforms/Passes.cpp index bf303f65e5..9e77b4c66c 100644 --- a/lib/Dialect/TTIR/Transforms/Passes.cpp +++ b/lib/Dialect/TTIR/Transforms/Passes.cpp @@ -420,13 +420,31 @@ inline MemorySpace getMemorySpace(RankedTensorType ty) { return getMemorySpace(layout); } -inline MemorySpace uppermostMemorySpace(OperandConstraint operandConstraint) { - if (bitEnumContainsAny(operandConstraint, OperandConstraint::L1)) { - return MemorySpace::DeviceL1; +inline OperandConstraint +memorySpaceAsOperandConstraint(MemorySpace memorySpace) { + switch (memorySpace) { + case MemorySpace::System: + case MemorySpace::SystemMMIO: + return OperandConstraint::System; + case MemorySpace::DeviceDRAM: + return OperandConstraint::DRAM; + case MemorySpace::DeviceL1: + return OperandConstraint::L1; + } +} + +inline MemorySpace getLegalMemorySpace(OperandConstraint operandConstraint, + MemorySpace defaultMemorySpace) { + if (bitEnumContainsAny(operandConstraint, + memorySpaceAsOperandConstraint(defaultMemorySpace))) { + return defaultMemorySpace; } if (bitEnumContainsAny(operandConstraint, OperandConstraint::DRAM)) { return MemorySpace::DeviceDRAM; } + if (bitEnumContainsAny(operandConstraint, OperandConstraint::L1)) { + return MemorySpace::DeviceL1; + } return MemorySpace::System; } @@ -547,8 +565,10 @@ static std::optional createToLayoutOp(PatternRewriter &rewriter, static std::optional createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, - OperandConstraint 
operandConstraint) { - auto desiredMemorySpace = uppermostMemorySpace(operandConstraint); + OperandConstraint operandConstraint, + MemorySpace defaultMemorySpace) { + auto desiredMemorySpace = + getLegalMemorySpace(operandConstraint, defaultMemorySpace); bool tiled = !bitEnumContainsAny(operandConstraint, OperandConstraint::Scalar); return createToLayoutOp(rewriter, loc, input, desiredMemorySpace, tiled); @@ -557,8 +577,10 @@ createToLayoutOp(PatternRewriter &rewriter, Location loc, Value input, class TTIRLayoutDPSOperandsRewriter : public OpInterfaceRewritePattern { public: - using OpInterfaceRewritePattern< - DestinationStyleOpInterface>::OpInterfaceRewritePattern; + TTIRLayoutDPSOperandsRewriter(MLIRContext *ctx, + MemorySpace defaultMemorySpace) + : OpInterfaceRewritePattern(ctx), + defaultMemorySpace(defaultMemorySpace) {} LogicalResult matchAndRewrite(DestinationStyleOpInterface op, PatternRewriter &rewriter) const final { @@ -582,8 +604,9 @@ class TTIRLayoutDPSOperandsRewriter mlir::cast(op.getOperation()) .getOperandConstraints()[operand.getOperandNumber()]) .getValue(); - auto desiredLayout = createToLayoutOp(rewriter, op.getLoc(), - operand.get(), operandConstraint); + auto desiredLayout = + createToLayoutOp(rewriter, op.getLoc(), operand.get(), + operandConstraint, defaultMemorySpace); if (desiredLayout) { rewriter.modifyOpInPlace(op, [&]() { @@ -599,6 +622,9 @@ class TTIRLayoutDPSOperandsRewriter return modified ? success() : failure(); } + +private: + MemorySpace defaultMemorySpace; }; class TTIRLayoutFuncReturnRewriter @@ -650,7 +676,8 @@ class TTIRLayout : public impl::TTIRLayoutBase { } { RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); + patterns.add(&getContext(), + defaultMemorySpace); patterns.add(&getContext(), initMemorySpace); FrozenRewritePatternSet patternSet(std::move(patterns)); diff --git a/lib/Dialect/TTMetal/Transforms/Passes.cpp b/lib/Dialect/TTMetal/Transforms/Passes.cpp index 4146db7c20..76f7763a8c 100644 --- a/lib/Dialect/TTMetal/Transforms/Passes.cpp +++ b/lib/Dialect/TTMetal/Transforms/Passes.cpp @@ -840,6 +840,7 @@ void createTTIRToTTMetalBackendPipeline(OpPassManager &pm) { pm.addPass(mlir::tt::ttir::createTTIRGenericRegion()); mlir::tt::ttir::TTIRLayoutOptions layoutOptions; layoutOptions.initMemorySpace = mlir::tt::MemorySpace::DeviceL1; + layoutOptions.defaultMemorySpace = mlir::tt::MemorySpace::DeviceL1; pm.addPass(mlir::tt::ttir::createTTIRLayout(layoutOptions)); pm.addPass(mlir::tt::ttir::createTTIRGenericRegionOperandsToMemref()); pm.addPass(mlir::tt::ttir::createTTIRAllocate()); diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp index cb78dfd682..56c05e8fe9 100644 --- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp +++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp @@ -23,6 +23,7 @@ void createTTIRToTTNNBackendPipeline( pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice()); mlir::tt::ttir::TTIRLayoutOptions layoutOptions; layoutOptions.initMemorySpace = mlir::tt::MemorySpace::System; + layoutOptions.defaultMemorySpace = mlir::tt::MemorySpace::DeviceDRAM; pm.addPass(mlir::tt::ttir::createTTIRLayout(layoutOptions)); if (options.gridSetPassEnabled) { diff --git a/test/ttmlir/Dialect/TTIR/test_grid_set.mlir b/test/ttmlir/Dialect/TTIR/test_grid_set.mlir index bf6eae61e9..0860ff4dab 100644 --- a/test/ttmlir/Dialect/TTIR/test_grid_set.mlir +++ b/test/ttmlir/Dialect/TTIR/test_grid_set.mlir @@ -3,7 +3,7 @@ module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, 
%arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { %0 = tensor.empty() : tensor<64x128xf32> - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #dram>> // CHECK: %[[C:.*]] = "ttir.multiply"[[C:.*]] -> tensor<64x128xf32, #[[LAYOUT_1]]> %1 = "ttir.multiply"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xf32>, tensor<64x128xf32>) -> tensor<64x128xf32> return %1 : tensor<64x128xf32> diff --git a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir index 5ba74e6f68..a8616f152b 100644 --- a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir +++ b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc.mlir @@ -3,7 +3,7 @@ #loc = loc("test_ops.py:17_0_0":0:0) module @pybuda_graph attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> (tensor<1x32x32xf32>, tensor<1x32x32xf32>) { - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #dram>> %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x32x32xf32, #[[LAYOUT_1]]> %1 = "ttir.add"(%arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir index ae356c4811..adf62660bc 100644 --- a/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir +++ b/test/ttmlir/Dialect/TTNN/multiple_add_with_loc_grid_override.mlir @@ -4,8 +4,8 @@ module @pybuda_graph attributes {} { func.func @main(%arg0: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg1: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0), %arg2: tensor<1x32x32xf32> loc("test_ops.py:17_0_0":0:0)) -> (tensor<1x32x32xf32>, tensor<1x32x32xf32>) { // CHECK: #[[LAYOUT_0:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #system>> - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <4x4>, memref<8x8xf32, #l1_>> - // CHECK: #[[LAYOUT_2:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <4x4>, memref<8x8xf32, #dram>> + // CHECK: #[[LAYOUT_2:.*]] = #tt.layout<(d0, d1, d2) -> (d0 * 32 + d1, d2), undef, <8x8>, memref<4x4xf32, #dram>> %0 = tensor.empty() : tensor<1x32x32xf32> loc(#loc5) // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x32x32xf32, #[[LAYOUT_1]]> %1 = "ttir.add"(%arg1, %arg2, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x32x32xf32>, tensor<1x32x32xf32>, tensor<1x32x32xf32>) -> tensor<1x32x32xf32> loc(#loc5) diff --git a/test/ttmlir/Dialect/TTNN/simple_matmul.mlir b/test/ttmlir/Dialect/TTNN/simple_matmul.mlir index 992b0c21db..f8ee937e74 100644 --- 
a/test/ttmlir/Dialect/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_matmul.mlir @@ -1,6 +1,6 @@ // RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn %s | FileCheck %s #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>> +// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>> module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> diff --git a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir index 00c67542ad..cfdfde2d14 100644 --- a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir +++ b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline.mlir @@ -2,7 +2,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <8x8>, memref<8x16xf32, #dram>> // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> diff --git a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir index 7b1d1ee475..e1acc7c802 100644 --- a/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir +++ b/test/ttmlir/Dialect/TTNN/ttir_to_ttnn_pipeline_custom_opt.mlir @@ -2,7 +2,7 @@ #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { - // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> + // CHECK: #[[LAYOUT_1:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #dram>> // CHECK: %[[C:.*]] = "ttnn.open_device"[[C:.*]] // CHECK: %[[C:.*]] = "ttnn.empty"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32> From ab9a8b29dc8276554a07d837ddf9eccbcda3b7cf Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Sat, 31 Aug 2024 11:13:33 -0700 Subject: [PATCH 07/16] Build fix (#567) --- test/ttmlir/Silicon/TTNN/simple_matmul.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir index a90e7817b5..fdee7305f9 100644 --- a/test/ttmlir/Silicon/TTNN/simple_matmul.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_matmul.mlir @@ -2,7 +2,7 @@ // RUN: FileCheck %s --input-file=%t.mlir // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn #any_device_tile = #tt.operand_constraint -// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #l1_>> +// CHECK: #[[TILED_LAYOUT:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>> module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>) -> tensor<64x96xbf16> { %0 = tensor.empty() : tensor<64x96xbf16> From fd466fc84a250a348c61f96d834ab3885aa2a958 Mon Sep 17 00:00:00 2001 From: Kyle Mabee 
<118925087+kmabeeTT@users.noreply.github.com> Date: Sat, 31 Aug 2024 18:47:12 -0400 Subject: [PATCH 08/16] Various runtime::ttmetal CQExecutor buffer map improvements / segfault workaround / TTMetal Tests (#529) * A pair of runtime::ttmetal CQExecutor buffer map improvements (#408) - Prevent duplicate Buffers from being created inside CreateBufferCommand handler by checking for existence in buffers umap. - Change to use buffers.erase() in DeallocateBufferCommand to actually remove the entry from buffers umap. Buffer will still be destroyed because it goes out of scope. - Neither of these help with the original segfault in this ticket but these were found through visual observation. * Temporary Workaround for tt-metal Segfaults during teardown (#408) - A hack, in createBufferFromTensorRef(), remove when proper bug fix is made in tt-metal and propagates here. * Remove UNSUPPORTED: true flag from tests now that CI mixing tests issue resolved - Update test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir with CHECK to avoid errors and add missing flag --ttmetal-serialize-to-binary to run ttm binary in CI --- runtime/include/tt/runtime/detail/ttmetal.h | 13 ++++++++++ runtime/lib/ttmetal/command_queue.cpp | 8 +++--- .../Silicon/TTMetal/simple_eltwise.mlir | 1 - .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 25 +++++++++++++++++-- test/ttmlir/Silicon/TTMetal/to_layout.mlir | 1 - 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index 964caa6a5e..b79bde0e14 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -161,6 +161,19 @@ createBufferFromTensorRef(::tt::tt_metal::Device *device, std::shared_ptr<::tt::tt_metal::Buffer> buffer = ::tt::tt_metal::CreateBuffer(shardedBufferConfig); assert(tensorRef->address()); + + // Issue #408: Temporary Hack, remove when fix available. + // Update tt-metal BUFFER_MAP with updated address and remove + // entry for original alloc'd address. 
+ auto &buffer_map = tt::tt_metal::detail::BUFFER_MAP; + auto map_copy = buffer_map.value(); + auto old_key = std::make_tuple(device->id(), buffer->address()); + if (auto it = map_copy.find(old_key); it != map_copy.end()) { + auto new_key = std::make_tuple(device->id(), tensorRef->address()); + buffer_map.insert(new_key, it->second); + buffer_map.erase(old_key); + } + buffer->set_address(tensorRef->address()); return buffer; } diff --git a/runtime/lib/ttmetal/command_queue.cpp b/runtime/lib/ttmetal/command_queue.cpp index 5899896530..d93e012c74 100644 --- a/runtime/lib/ttmetal/command_queue.cpp +++ b/runtime/lib/ttmetal/command_queue.cpp @@ -242,8 +242,10 @@ void CQExecutor::execute( void CQExecutor::execute( ::tt::target::metal::CreateBufferCommand const *command) { - buffers[command->ref()->global_id()] = - createBufferFromTensorRef(device, command->ref()); + if (buffers.find(command->ref()->global_id()) == buffers.end()) { + buffers[command->ref()->global_id()] = + createBufferFromTensorRef(device, command->ref()); + } } void CQExecutor::execute( @@ -252,7 +254,7 @@ void CQExecutor::execute( assert(iter != buffers.end() && "Buffer not allocated"); assert(iter->second != nullptr && "Buffer already deallocated"); ::tt::tt_metal::DeallocateBuffer(*iter->second); - iter->second.reset(); + buffers.erase(iter); } void CQExecutor::execute( diff --git a/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir b/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir index 494e3f19a6..fdd65864df 100644 --- a/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTMetal/simple_eltwise.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-to-ttmetal-backend-pipeline --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s -// UNSUPPORTED: true #any_device = #tt.operand_constraint func.func @multiply(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { diff --git a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir index 1664ceba3e..1cebfe4515 100644 --- a/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir +++ b/test/ttmlir/Silicon/TTMetal/tiled_reblock.mlir @@ -1,5 +1,4 @@ -// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s -// UNSUPPORTED: true +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s #l1_ = #tt.memory_space #untilized = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> @@ -7,11 +6,17 @@ #tilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<1x2x!tt.tile<32 x 32, f32>, #l1_>> #untilized2x2 = #tt.layout<(d0, d1) -> (d0, d1), undef, <2x2>, memref<32x64xf32, #l1_>> func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64x128xf32, #untilized2x2> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<64x128xf32, #tilized> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #untilized>, tensor<64x128xf32, #tilized>) -> tensor<64x128xf32, #tilized> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<64x128xf32, #tilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<64x128xf32, #tilized>, tensor<64x128xf32, #tilized2x2>) -> 
tensor<64x128xf32, #tilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<64x128xf32, #untilized2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<64x128xf32, #tilized2x2>, tensor<64x128xf32, #untilized2x2>) -> tensor<64x128xf32, #untilized2x2> return %5 : tensor<64x128xf32, #untilized2x2> } @@ -22,13 +27,19 @@ func.func @tilize_reblock_2D(%arg0: tensor<64x128xf32, #untilized>) -> tensor<64 #tilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<6x2x!tt.tile<32 x 32, f32>, #l1_>> #untilized4D_2x2 = #tt.layout<(d0, d1, d2, d3) -> (d0 * 192 + d1 * 64 + d2, d3), undef, <2x2>, memref<192x64xf32, #l1_>> func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> { + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<2x3x64x128xf32, #untilized4D>, tensor<2x3x64x128xf32, #tilized4D>) -> tensor<2x3x64x128xf32, #tilized4D> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<2x3x64x128xf32, #tilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<2x3x64x128xf32, #tilized4D>, tensor<2x3x64x128xf32, #tilized4D_2x2>) -> tensor<2x3x64x128xf32, #tilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<2x3x64x128xf32, #untilized4D_2x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<2x3x64x128xf32, #tilized4D_2x2>, tensor<2x3x64x128xf32, #untilized4D_2x2>) -> tensor<2x3x64x128xf32, #untilized4D_2x2> return %5 : tensor<2x3x64x128xf32, #untilized4D_2x2> @@ -40,23 +51,33 @@ func.func @tilize_reblock_4D(%arg0: tensor<2x3x64x128xf32, #untilized4D>) -> ten #tilized_big_3x6 = #tt.layout<(d0, d1) -> (d0, d1), undef, <3x6>, memref<1x1x!tt.tile<32 x 32, f32>, #l1_>> func.func @tilize_reblock_big(%arg0: tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> { // move to tilized 1x1 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %0 = tensor.empty() : tensor<96x192xf32, #tilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %1 = "ttir.to_layout"(%arg0, %0) : (tensor<96x192xf32, #untilized_big>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big> // move to tilized 2x3 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %2 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x2> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %3 = "ttir.to_layout"(%1, %2) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #tilized_big_3x2>) -> tensor<96x192xf32, #tilized_big_3x2> // move to tilized 3x3 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %4 = tensor.empty() : tensor<96x192xf32, #tilized_big_3x6> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %5 = "ttir.to_layout"(%3, %4) : (tensor<96x192xf32, #tilized_big_3x2>, tensor<96x192xf32, #tilized_big_3x6>) -> tensor<96x192xf32, #tilized_big_3x6> // move back to tilized 1x1 + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %6 = tensor.empty() : tensor<96x192xf32, #tilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %7 = "ttir.to_layout"(%5, %6) : (tensor<96x192xf32, #tilized_big_3x6>, tensor<96x192xf32, #tilized_big>) -> tensor<96x192xf32, #tilized_big> // untilize + // CHECK: %[[C:.*]] = "ttmetal.alloc"[[C:.*]] %8 = tensor.empty() : 
tensor<96x192xf32, #untilized_big> + // CHECK: %[[C:.*]] = "ttmetal.dispatch"[[C:.*]] %9 = "ttir.to_layout"(%7, %8) : (tensor<96x192xf32, #tilized_big>, tensor<96x192xf32, #untilized_big>) -> tensor<96x192xf32, #untilized_big> return %9 : tensor<96x192xf32, #untilized_big> diff --git a/test/ttmlir/Silicon/TTMetal/to_layout.mlir b/test/ttmlir/Silicon/TTMetal/to_layout.mlir index 6b361a76da..f268e7b397 100644 --- a/test/ttmlir/Silicon/TTMetal/to_layout.mlir +++ b/test/ttmlir/Silicon/TTMetal/to_layout.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal --ttmetal-serialize-to-binary="output=%t.ttm" %s | FileCheck %s -// UNSUPPORTED: true #l1_ = #tt.memory_space #layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<4x16xf32, #l1_>> From 85c081f05479c421c876e2a21ec186f317e61049 Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Mon, 2 Sep 2024 18:18:22 -0700 Subject: [PATCH 09/16] Add system desc attribute dram_unreserved_end (#540) Calculate the end of the DRAM region that is not usable by compiler. This upper region of memory is where kernel programs get allocated to. This calculation intends to estimate some conservative max number, but still needs a mechanism to enforce during runtime #539. --- include/ttmlir/Dialect/TT/IR/TTOpsTypes.td | 4 ++- include/ttmlir/Target/Common/types.fbs | 1 + .../ttmlir/Target/Utils/MLIRToFlatbuffer.h | 1 + lib/CAPI/TTAttrs.cpp | 6 ++-- lib/Dialect/TT/IR/TTOpsTypes.cpp | 6 ++-- python/TTModule.cpp | 6 ++-- runtime/lib/common/system_desc.cpp | 31 +++++++++++++++++-- 7 files changed, 43 insertions(+), 12 deletions(-) diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td index 6f55d29f62..35c6110767 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td @@ -107,6 +107,7 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> { "unsigned":$l1UnreservedBase, "unsigned":$eriscL1UnreservedBase, "unsigned":$dramUnreservedBase, + "unsigned":$dramUnreservedEnd, "ChipPhysicalCoresAttr":$chipPhysicalCores, ArrayRefParameter<"DataTypeAttr">:$supportedDataTypes, ArrayRefParameter<"TileSizeAttr">:$supportedTileSizes); @@ -121,13 +122,14 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> { `l1_unreserved_base` `=` $l1UnreservedBase `,` `erisc_l1_unreserved_base` `=` $eriscL1UnreservedBase `,` `dram_unreserved_base` `=` $dramUnreservedBase `,` + `dram_unreserved_end` `=` $dramUnreservedEnd `,` `physical_cores` `=` $chipPhysicalCores `,` `supported_data_types` `=` `[` $supportedDataTypes `]` `,` `supported_tile_sizes` `=` `[` $supportedTileSizes `]` `}`}]; let extraClassDeclaration = [{ unsigned getUsableL1Size() const { return getL1Size() - getL1UnreservedBase(); } - unsigned getUsableDramChannelSize() const { return getDramChannelSize() - getDramUnreservedBase(); } + unsigned getUsableDramChannelSize() const { return getDramUnreservedEnd() - getDramUnreservedBase(); } }]; } diff --git a/include/ttmlir/Target/Common/types.fbs b/include/ttmlir/Target/Common/types.fbs index 42a8287611..f3d588c316 100644 --- a/include/ttmlir/Target/Common/types.fbs +++ b/include/ttmlir/Target/Common/types.fbs @@ -105,6 +105,7 @@ table ChipDesc { l1_unreserved_base: uint32; erisc_l1_unreserved_base: uint32; dram_unreserved_base: uint32; + dram_unreserved_end: uint32; physical_cores: ChipPhysicalCores; supported_data_types: [DataType]; 
supported_tile_sizes: [Dim2d]; diff --git a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h index fa8e67466e..b56834d25a 100644 --- a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h +++ b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h @@ -244,6 +244,7 @@ toFlatbuffer(FlatbufferObjectCache &cache, ChipDescAttr chipDesc) { chipDesc.getPcieAddressAlignBytes(), chipDesc.getNocDRAMAddressAlignBytes(), chipDesc.getL1UnreservedBase(), chipDesc.getEriscL1UnreservedBase(), chipDesc.getDramUnreservedBase(), + chipDesc.getDramUnreservedEnd(), toFlatbuffer(cache, chipDesc.getChipPhysicalCores()), toFlatbuffer(cache, chipDesc.getSupportedDataTypes()), toFlatbuffer(cache, chipDesc.getSupportedTileSizes())); diff --git a/lib/CAPI/TTAttrs.cpp b/lib/CAPI/TTAttrs.cpp index 07db90b516..e3bee6e056 100644 --- a/lib/CAPI/TTAttrs.cpp +++ b/lib/CAPI/TTAttrs.cpp @@ -38,14 +38,14 @@ MlirAttribute ttmlirTTChipDescAttrGet( unsigned nocL1AddressAlignBytes, unsigned pcieAddressAlignBytes, unsigned nocDRAMAddressAlignBytes, unsigned l1UnreservedBase, unsigned eriscL1UnreservedBase, unsigned dramUnreservedBase, - MlirAttribute chipPhysicalCores, MlirAttribute *supportedDataTypes, - MlirAttribute *supportedTileSizes) { + unsigned dramUnreservedEnd, MlirAttribute chipPhysicalCores, + MlirAttribute *supportedDataTypes, MlirAttribute *supportedTileSizes) { std::vector gridVec(grid, grid + gridSize); return wrap(ChipDescAttr::get( unwrap(ctx), mlir::dyn_cast(unwrap(arch)), gridVec, l1Size, numDramChannels, dramChannelSize, nocL1AddressAlignBytes, pcieAddressAlignBytes, nocDRAMAddressAlignBytes, l1UnreservedBase, - eriscL1UnreservedBase, dramUnreservedBase, + eriscL1UnreservedBase, dramUnreservedBase, dramUnreservedEnd, mlir::dyn_cast(unwrap(chipPhysicalCores)), mlir::dyn_cast(unwrap(*supportedDataTypes)), mlir::dyn_cast(unwrap(*supportedTileSizes)))); diff --git a/lib/Dialect/TT/IR/TTOpsTypes.cpp b/lib/Dialect/TT/IR/TTOpsTypes.cpp index d760b012eb..35f48177c6 100644 --- a/lib/Dialect/TT/IR/TTOpsTypes.cpp +++ b/lib/Dialect/TT/IR/TTOpsTypes.cpp @@ -84,7 +84,7 @@ mlir::tt::SystemDescAttr::getDefault(MLIRContext *context) { { tt::ChipDescAttr::get( context, tt::ArchAttr::get(context, tt::Arch::WormholeB0), - gridShape, 1499136, 12, (1 << 30), 16, 32, 32, 0, 0, 0, + gridShape, 1499136, 12, (1 << 30), 16, 32, 32, 0, 0, 0, (1 << 30), tt::ChipPhysicalCoresAttr::get(context, workerCores, dramCores, {}, {}), supported_data_types, supported_tile_sizes), @@ -242,8 +242,8 @@ mlir::tt::SystemDescAttr::getFromPath(MLIRContext *context, std::string &path) { element->pcie_address_align_bytes(), element->noc_dram_address_align_bytes(), element->l1_unreserved_base(), element->erisc_l1_unreserved_base(), element->dram_unreserved_base(), - chip_physical_cores_attr, supported_data_types_attr, - supported_tile_sizes_attr); + element->dram_unreserved_end(), chip_physical_cores_attr, + supported_data_types_attr, supported_tile_sizes_attr); chip_desc_list.push_back(current_chip_desc_attr); } diff --git a/python/TTModule.cpp b/python/TTModule.cpp index 8ebf4c9a74..1e3841deac 100644 --- a/python/TTModule.cpp +++ b/python/TTModule.cpp @@ -124,15 +124,15 @@ void populateTTModule(py::module &m) { unsigned dramChannelSize, unsigned nocL1AddressAlignBytes, unsigned pcieAddressAlignBytes, unsigned nocDRAMAddressAlignBytes, unsigned l1UnreservedBase, unsigned eriscL1UnreservedBase, - unsigned dramUnreservedBase, MlirAttribute chipPhysicalCores, - MlirAttribute supportedDataTypes, + unsigned dramUnreservedBase, 
unsigned dramUnreservedEnd, + MlirAttribute chipPhysicalCores, MlirAttribute supportedDataTypes, MlirAttribute supportedTileSizes) { return wrap(tt::ChipDescAttr::get( unwrap(ctx), mlir::cast(unwrap(arch)), grid, l1Size, numDramChannels, dramChannelSize, nocL1AddressAlignBytes, pcieAddressAlignBytes, nocDRAMAddressAlignBytes, l1UnreservedBase, - eriscL1UnreservedBase, dramUnreservedBase, + eriscL1UnreservedBase, dramUnreservedBase, dramUnreservedEnd, mlir::dyn_cast( unwrap(chipPhysicalCores)), mlir::cast(unwrap(supportedDataTypes)), diff --git a/runtime/lib/common/system_desc.cpp b/runtime/lib/common/system_desc.cpp index 2e84fe8ef2..091b193397 100644 --- a/runtime/lib/common/system_desc.cpp +++ b/runtime/lib/common/system_desc.cpp @@ -148,6 +148,31 @@ createChipPhysicalCores(const ::tt::tt_metal::Device *device, fbb.CreateVectorOfStructs(eth_inactive_cores)); } +// Calculate the end of the DRAM region that is not usable by compiler. This +// upper region of memory is where kernel programs get allocated to. This +// function intends to estimate some conservative max number. +static std::uint32_t +calculateDRAMUnreservedEnd(const ::tt::tt_metal::Device *device) { + CoreCoord deviceGridSize = device->logical_grid_size(); + CoreCoord dramGridSize = device->dram_grid_size(); + std::uint32_t totalCores = deviceGridSize.x * deviceGridSize.y + + device->get_active_ethernet_cores().size(); + std::uint32_t totalDramCores = dramGridSize.x * dramGridSize.y; + std::uint32_t programCarveOutPerCore = L1_UNRESERVED_BASE; + std::uint32_t totalProgramCarveOut = programCarveOutPerCore * totalCores; + // The total carve out can be interleaved between all dram channels + std::uint32_t programCarveOutDramSpace = + (totalProgramCarveOut + totalDramCores - 1) / totalDramCores; + static_assert(DRAM_ALIGNMENT > 0); + static_assert((DRAM_ALIGNMENT & (DRAM_ALIGNMENT - 1)) == 0); + assert(programCarveOutDramSpace < device->dram_size_per_channel()); + std::uint32_t dramUnreservedEnd = + device->dram_size_per_channel() - programCarveOutDramSpace; + // Align to DRAM_ALIGNMENT + dramUnreservedEnd = dramUnreservedEnd & ~(DRAM_ALIGNMENT - 1); + return dramUnreservedEnd; +} + static std::unique_ptr<::tt::runtime::SystemDesc> getCurrentSystemDescImpl(const ::tt::tt_metal::DeviceMesh &deviceMesh) { std::vector<::tt::tt_metal::Device *> devices = deviceMesh.get_devices(); @@ -192,13 +217,15 @@ getCurrentSystemDescImpl(const ::tt::tt_metal::DeviceMesh &deviceMesh) { auto supportedTileSizes = fbb.CreateVectorOfStructs(supportedTileSizesVector); + auto dramUnreservedEnd = calculateDRAMUnreservedEnd(device); + chipDescs.push_back(::tt::target::CreateChipDesc( fbb, toFlatbuffer(device->arch()), &deviceGrid, device->l1_size_per_core(), device->num_dram_channels(), device->dram_size_per_channel(), L1_ALIGNMENT, PCIE_ALIGNMENT, DRAM_ALIGNMENT, L1_UNRESERVED_BASE, ERISC_L1_UNRESERVED_BASE, - DRAM_UNRESERVED_BASE, chipPhysicalCores, supportedDataTypes, - supportedTileSizes)); + DRAM_UNRESERVED_BASE, dramUnreservedEnd, chipPhysicalCores, + supportedDataTypes, supportedTileSizes)); chipDescIndices.push_back(device->id()); // Derive chip capability ::tt::target::ChipCapability chipCapability = From 5d60c17023ca71ba8b795b3b0a9647d332f4930b Mon Sep 17 00:00:00 2001 From: Radenko Pavlovic <133032400+rpavlovicTT@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:57:48 +0200 Subject: [PATCH 10/16] Refactoring in TTMetal dialect (#578) This commit refactors: 1. Dialect conversion from TTKernel to EmitC. 2. 
Serialization of TTMetal IR to flatbuffer binary. 1. Implement dialect conversion from TTKernel to EmitC TTKernel dialect that can be found nested in TTMetal ops can now be converted via 'convert-ttkernel-to-emitc' pass. Pass is registered as a func::FuncOp pass so the kernel must be put inside a function before conversion. When serializing ttmetal IR to binary, we call this conversion for every region of a ttmetal dispatch op. FileCheck UT is added. 2. Translate TTMetal to flatbuffer Serialization to flatbuffer binary is now a proper translation pass that can be run with: ttmlir-translate --ttmetal-to-flatbuffer ttmetal.mlir --- include/ttmlir/Conversion/Passes.h | 1 + include/ttmlir/Conversion/Passes.td | 6 + .../TTKernelToEmitC/TTKernelToEmitC.h | 38 ++ .../Dialect/TTMetal/Transforms/KernelsToCpp.h | 18 - .../Dialect/TTMetal/Transforms/Passes.td | 11 - .../Target/TTMetal/TTMetalToFlatbuffer.h | 19 + include/ttmlir/Target/TTNN/TTNNToFlatbuffer.h | 4 +- lib/Conversion/CMakeLists.txt | 7 +- lib/Conversion/TTKernelToEmitC/CMakeLists.txt | 12 + .../TTKernelToEmitC/TTKernelToEmitC.cpp | 414 ++++++++++++++++++ lib/Conversion/TosaToTTIR/TosaToTTIR.cpp | 15 +- lib/Dialect/TTKernel/IR/TTKernelOps.cpp | 2 +- lib/Dialect/TTMetal/Transforms/CMakeLists.txt | 2 - .../TTMetal/Transforms/KernelsToCpp.cpp | 327 -------------- .../TTMetal/Transforms/SerializeToBinary.cpp | 295 ------------- lib/SharedLib/CMakeLists.txt | 1 + lib/Target/CMakeLists.txt | 1 + lib/Target/TTMetal/CMakeLists.txt | 17 + lib/Target/TTMetal/TTMetalToFlatbuffer.cpp | 289 ++++++++++++ .../TTMetalToFlatbufferRegistration.cpp | 29 ++ .../Conversion/TTKernelToEmitC/ttkernel.mlir | 50 +++ .../Silicon/TTMetal/simple_eltwise.mlir | 2 +- .../ttmlir/Silicon/TTMetal/tiled_reblock.mlir | 2 +- test/ttmlir/Silicon/TTMetal/to_layout.mlir | 2 +- tools/ttmlir-translate/CMakeLists.txt | 2 +- tools/ttmlir-translate/ttmlir-translate.cpp | 7 + 26 files changed, 903 insertions(+), 670 deletions(-) create mode 100644 include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h delete mode 100644 include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h create mode 100644 include/ttmlir/Target/TTMetal/TTMetalToFlatbuffer.h create mode 100644 lib/Conversion/TTKernelToEmitC/CMakeLists.txt create mode 100644 lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp delete mode 100644 lib/Dialect/TTMetal/Transforms/KernelsToCpp.cpp delete mode 100644 lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp create mode 100644 lib/Target/TTMetal/CMakeLists.txt create mode 100644 lib/Target/TTMetal/TTMetalToFlatbuffer.cpp create mode 100644 lib/Target/TTMetal/TTMetalToFlatbufferRegistration.cpp create mode 100644 test/ttmlir/Conversion/TTKernelToEmitC/ttkernel.mlir diff --git a/include/ttmlir/Conversion/Passes.h b/include/ttmlir/Conversion/Passes.h index b12e9ebb83..7750486686 100644 --- a/include/ttmlir/Conversion/Passes.h +++ b/include/ttmlir/Conversion/Passes.h @@ -9,6 +9,7 @@ #include "ttmlir/Conversion/StableHLOToTTIR/StableHLOToTTIR.h" #endif #include "ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h" +#include "ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h" #include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h" #include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h" #include "ttmlir/Dialect/TTIR/IR/TTIR.h" diff --git a/include/ttmlir/Conversion/Passes.td b/include/ttmlir/Conversion/Passes.td index 4925e61cc4..92926dbfdb 100644 --- a/include/ttmlir/Conversion/Passes.td +++ b/include/ttmlir/Conversion/Passes.td @@ -34,4 +34,10 @@ def ConvertTTNNToEmitC : 
Pass<"convert-ttnn-to-emitc", "::mlir::ModuleOp"> { let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::tt::ttnn::TTNNDialect"]; } +def ConvertTTKernelToEmitC : Pass<"convert-ttkernel-to-emitc", "::func::FuncOp"> { + let summary = "Convert TTKernel dialect to EmitC dialect."; + let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::func::FuncDialect", + "mlir::tt::ttkernel::TTKernelDialect"]; +} + #endif // TTMLIR_CONVERSION_PASSES diff --git a/include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h b/include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h new file mode 100644 index 0000000000..57592eb3ab --- /dev/null +++ b/include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H +#define TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +#include "ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.h" +#include "ttmlir/Dialect/TTMetal/IR/TTMetalOps.h" +#include + +namespace mlir::tt { +#define GEN_PASS_DECL_CONVERTTTKERNELTOEMITC +#include "ttmlir/Conversion/Passes.h.inc" + +// Runs a conversion pass to EmitC dialect on a func op containing given +// region's body. Also, it adds boilerplate code such as includes and namespace +// declarations. +LogicalResult +convertTTKernelRegionToEmitC(OpBuilder &builder, Region *region, + const ttkernel::ThreadTypeAttr &threadType); + +// Converts given region to EmitC dialect and translates it to C++ code. +LogicalResult +emitDispatchOpRegionAsCpp(Region *region, std::string ®ionCpp, + const ttkernel::ThreadTypeAttr &threadType); + +// Converts dispatch op's regions to C++ code. +LogicalResult +emitDispatchOpRegionsAsCpp(ttmetal::DispatchOp dispatchOp, + llvm::SmallVector &cppStrings); + +} // namespace mlir::tt + +#endif diff --git a/include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h b/include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h deleted file mode 100644 index 18c1a9ef6e..0000000000 --- a/include/ttmlir/Dialect/TTMetal/Transforms/KernelsToCpp.h +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef TTMLIR_DIALECT_TTMETAL_TRANSFORMS_KERNELSTOCPP_H -#define TTMLIR_DIALECT_TTMETAL_TRANSFORMS_KERNELSTOCPP_H - -#include "mlir/Support/LogicalResult.h" - -#include "ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.h" -#include "ttmlir/Dialect/TTMetal/IR/TTMetalOps.h" - -namespace mlir::tt::ttmetal { -LogicalResult emitDispatchOpRegionAsCpp(DispatchOp dispatchOp, - unsigned regionNumber, - llvm::raw_ostream &os); -} // namespace mlir::tt::ttmetal -#endif diff --git a/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td b/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td index ee6f024084..e321db93a3 100644 --- a/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTMetal/Transforms/Passes.td @@ -14,15 +14,4 @@ def ConvertTTIRToTTMetal: Pass<"convert-ttir-to-ttmetal", "::mlir::ModuleOp"> { }]; } -def TTMetalSerializeToBinary: Pass<"ttmetal-serialize-to-binary", "::mlir::ModuleOp"> { - let summary = ""; - let description = [{ - todo - }]; - - list