Merge branch 'main' into odjuricic/grid-analysis
odjuricicTT committed Sep 4, 2024
2 parents 83bb3a6 + 95b2a90 commit cc2d180
Showing 98 changed files with 2,566 additions and 1,150 deletions.
4 changes: 2 additions & 2 deletions docs/src/build.md
@@ -211,8 +211,8 @@ If you get the following error, it means you need to install clang which you can
### `sfpi`, `trisc`, `ncrisc` build failure

```
pybuda/third_party/tt-mlir/third_party/tt-metal/src/tt-metal/tt_metal/third_party/sfpi/compiler/bin/riscv32-unknown-elf-g++: 1: version: not found
pybuda/third_party/tt-mlir/third_party/tt-metal/src/tt-metal/tt_metal/third_party/sfpi/compiler/bin/riscv32-unknown-elf-g++: 2: oid: not found
tt-forge-fe/third_party/tt-mlir/third_party/tt-metal/src/tt-metal/tt_metal/third_party/sfpi/compiler/bin/riscv32-unknown-elf-g++: 1: version: not found
tt-forge-fe/third_party/tt-mlir/third_party/tt-metal/src/tt-metal/tt_metal/third_party/sfpi/compiler/bin/riscv32-unknown-elf-g++: 2: oid: not found
size: '1961632': No such file
size: '1961632': No such file
size: '1961632': No such file
2 changes: 1 addition & 1 deletion docs/src/overview.md
@@ -202,7 +202,7 @@ level of complexity downwards for the bottom, we will define a very
aggressive TTNN backend for the MVP.
Desired Optimization List:

- BUDA (frontend)
- Forge-FE (frontend)

- Graph Optimizations, Constant Folding, Operation Fusion

14 changes: 7 additions & 7 deletions docs/src/specs/runtime-stitching.md
@@ -13,8 +13,8 @@ between the compiler and the runtime.

### Simple Example
```
mod_a = pybuda.compile(PyTorch_module_a)
mod_b = pybuda.compile(PyTorch_module_b)
mod_a = forge.compile(PyTorch_module_a)
mod_b = forge.compile(PyTorch_module_b)
for i in range(10):
    outs_a = mod_a(ins_a)
@@ -26,15 +26,15 @@ for i in range(10):
`mod_a` it should be completely unaware that `mod_b` will take place and vice-versa.
In order to achieve this we propose a new runtime concept called stitching:

- pybuda invokes compile step for `mod_a`, tt-mlir compiler determines where the
- forge invokes compile step for `mod_a`, tt-mlir compiler determines where the
inputs (`ins_a`) should live, host, device dram, device l1. tt-mlir returns
metadata to pybuda describing where it wants the tensors to reside before invoking
metadata to forge describing where it wants the tensors to reside before invoking
flatbuffer submission.
- pybuda invokes compile step for `mod_b`, same happens as bullet 1
- `mod_a` is invoked at runtime, pybuda runtime needs to inspect the compiler metadata
- forge invokes compile step for `mod_b`, same happens as bullet 1
- `mod_a` is invoked at runtime, forge runtime needs to inspect the compiler metadata
to determine where the tensors should live. Runtime manually invokes a new data
copy command to get the tensors to the correct memory space / correct memory address.
- pybuda runtime invokes `mod_a` program submit
- forge runtime invokes `mod_a` program submit
- `mod_b` is invoked at runtime, this time it might be that the compiler left
the tensor outputs in L1, so no data copy is needed to start running `mod_b`
since the inputs are already in the correct location.
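The stitching flow above can be sketched in pseudocode. All names here (`CompiledModule`, `metadata`, `run_stitched`) are illustrative stand-ins, not actual forge or tt-mlir APIs:

```python
# Illustrative sketch of runtime stitching; every name here is hypothetical,
# not a real forge/tt-mlir API.

class CompiledModule:
    def __init__(self, name, desired_input_space):
        self.name = name
        # Compiler metadata: where this module wants its inputs to live
        # before flatbuffer submission (host, device dram, device l1).
        self.metadata = {"input_space": desired_input_space}

def run_stitched(module, tensors, current_space):
    # Runtime inspects the compiler metadata and issues a data-copy
    # command only when the tensors are not already where the module
    # wants them.
    wanted = module.metadata["input_space"]
    copied = current_space != wanted  # stands in for a real device copy
    return wanted, copied

mod_a = CompiledModule("mod_a", desired_input_space="device_l1")
mod_b = CompiledModule("mod_b", desired_input_space="device_l1")

# mod_a's inputs start on host, so a copy is needed.
space, copied = run_stitched(mod_a, None, "host")
assert copied
# mod_a left its outputs in L1, so mod_b starts without any copy.
space, copied = run_stitched(mod_b, None, space)
assert not copied
```

Each module stays unaware of the other; only the runtime, guided by per-module metadata, decides when copies happen.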
2 changes: 2 additions & 0 deletions docs/src/ttrt.md
@@ -63,6 +63,8 @@ ttrt query --save-artifacts
4. Use ttmlir-opt tool in compiler to feed system descriptor. See the [ttmlir-opt](./ttmlir-opt.md) documentation for more information on how to generate .mlir files.
```bash
./build/bin/ttmlir-opt --ttir-load-system-desc="path=/path/to/system_desc.ttsys" --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_subtract.mlir -o ttnn.mlir
# or pass the path directly to ttir-to-ttnn-backend-pipeline:
./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=/path/to/system_desc.ttsys" test/ttmlir/Dialect/TTNN/simple_subtract.mlir -o ttnn.mlir
```
5. Use ttmlir-translate tool in compiler to generate the flatbuffer executable. See the [ttmlir-translate](./ttmlir-translate.md) documentation for more information on how to generate flatbuffer files.
```bash
11 changes: 6 additions & 5 deletions include/ttmlir-c/TTAttrs.h
@@ -49,18 +49,19 @@ MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTSystemDescAttrGet(
MlirAttribute *chipCoords, size_t chipCoordsSize,
MlirAttribute *chipChannels, size_t chipChannelsSize);

MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTLayoutAttrGet(MlirContext ctx,
MlirAffineMap linear,
unsigned oobVal,
MlirAttribute grid,
MlirType memref);
MLIR_CAPI_EXPORTED MlirAttribute
ttmlirTTLayoutAttrGet(MlirContext ctx, MlirAffineMap linear, unsigned oobVal,
MlirAttribute grid, MlirType memref, unsigned memLayout);

MLIR_CAPI_EXPORTED MlirAttribute
ttmlirTTMemorySpaceAttrGet(MlirContext ctx, uint32_t memorySpace);

MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOOBValAttrGet(MlirContext ctx,
uint32_t oobVal);

MLIR_CAPI_EXPORTED MlirAttribute
ttmlirTTTensorMemoryLayoutAttrGet(MlirContext ctx, uint32_t memLayout);

MLIR_CAPI_EXPORTED MlirAttribute
ttmlirTTIteratorTypeAttrGet(MlirContext ctx, uint32_t iteratorType);

1 change: 1 addition & 0 deletions include/ttmlir/Conversion/Passes.h
@@ -9,6 +9,7 @@
#include "ttmlir/Conversion/StableHLOToTTIR/StableHLOToTTIR.h"
#endif
#include "ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h"
#include "ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h"
#include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h"
#include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h"
#include "ttmlir/Dialect/TTIR/IR/TTIR.h"
6 changes: 6 additions & 0 deletions include/ttmlir/Conversion/Passes.td
@@ -34,4 +34,10 @@ def ConvertTTNNToEmitC : Pass<"convert-ttnn-to-emitc", "::mlir::ModuleOp"> {
let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::tt::ttnn::TTNNDialect"];
}

def ConvertTTKernelToEmitC : Pass<"convert-ttkernel-to-emitc", "::func::FuncOp"> {
let summary = "Convert TTKernel dialect to EmitC dialect.";
let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::func::FuncDialect",
"mlir::tt::ttkernel::TTKernelDialect"];
}

#endif // TTMLIR_CONVERSION_PASSES
38 changes: 38 additions & 0 deletions include/ttmlir/Conversion/TTKernelToEmitC/TTKernelToEmitC.h
@@ -0,0 +1,38 @@
// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#ifndef TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H
#define TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"

#include "ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.h"
#include "ttmlir/Dialect/TTMetal/IR/TTMetalOps.h"
#include <llvm/ADT/SmallVector.h>

namespace mlir::tt {
#define GEN_PASS_DECL_CONVERTTTKERNELTOEMITC
#include "ttmlir/Conversion/Passes.h.inc"

// Runs a conversion pass to the EmitC dialect on a func op containing the
// given region's body. It also adds boilerplate such as includes and
// namespace declarations.
LogicalResult
convertTTKernelRegionToEmitC(OpBuilder &builder, Region *region,
const ttkernel::ThreadTypeAttr &threadType);

// Converts given region to EmitC dialect and translates it to C++ code.
LogicalResult
emitDispatchOpRegionAsCpp(Region *region, std::string &regionCpp,
const ttkernel::ThreadTypeAttr &threadType);

// Converts dispatch op's regions to C++ code.
LogicalResult
emitDispatchOpRegionsAsCpp(ttmetal::DispatchOp dispatchOp,
llvm::SmallVector<std::string> &cppStrings);

} // namespace mlir::tt

#endif // TTMLIR_CONVERSION_TTKERNELTOEMITC_TTKERNELTOEMITC_H
43 changes: 39 additions & 4 deletions include/ttmlir/Dialect/TT/IR/TTOpsEnums.td
@@ -72,6 +72,26 @@ def TT_MemorySpace : I32EnumAttr<"MemorySpace", "TT MemorySpace",
let cppNamespace = "::mlir::tt";
}

def TT_NoneLayout : I32EnumAttrCase<"NoneLayout", 0, "none_layout">;
def TT_Interleaved : I32EnumAttrCase<"Interleaved", 1, "interleaved">;
def TT_SingleBank : I32EnumAttrCase<"SingleBank", 2, "single_bank">;
def TT_HeightSharded : I32EnumAttrCase<"HeightSharded", 3, "height_sharded">;
def TT_WidthSharded : I32EnumAttrCase<"WidthSharded", 4, "width_sharded">;
def TT_BlockSharded : I32EnumAttrCase<"BlockSharded", 5, "block_sharded">;

def TT_TensorMemoryLayout : I32EnumAttr<"TensorMemoryLayout", "TT TensorMemoryLayout",
[
TT_NoneLayout,
TT_Interleaved,
TT_SingleBank,
TT_HeightSharded,
TT_WidthSharded,
TT_BlockSharded,
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::tt";
}

def TT_Parallel : I32EnumAttrCase<"Parallel", 0, "parallel">;
def TT_Systolic : I32EnumAttrCase<"Systolic", 1, "systolic">;
def TT_Broadcast : I32EnumAttrCase<"Broadcast", 2, "broadcast">;
@@ -109,20 +129,35 @@ def TT_OperandConstraintDRAM : I32BitEnumAttrCaseBit<"DRAM", 1, "dram">;
def TT_OperandConstraintL1 : I32BitEnumAttrCaseBit<"L1", 2, "l1">;
def TT_OperandConstraintScalar : I32BitEnumAttrCaseBit<"Scalar", 3, "scalar">;
def TT_OperandConstraintTile : I32BitEnumAttrCaseBit<"Tile", 4, "tile">;
def TT_OperandConstraintAny : I32BitEnumAttrCaseGroup<"Any", [TT_OperandConstraintSystem, TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile], "any">;
def TT_OperandConstraintAnyDevice : I32BitEnumAttrCaseGroup<"AnyDevice", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile], "any_device">;
def TT_OperandConstraintAnyDeviceTile : I32BitEnumAttrCaseGroup<"AnyDeviceTile", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintTile], "any_device_tile">;

def TT_OperandConstraintNoneLayout : I32BitEnumAttrCaseBit<"NoneLayout", 5, "none_layout">;
def TT_OperandConstraintInterleaved : I32BitEnumAttrCaseBit<"Interleaved", 6, "interleaved">;
def TT_OperandConstraintSingleBank : I32BitEnumAttrCaseBit<"SingleBank", 7, "single_bank">;
def TT_OperandConstraintHeightSharded : I32BitEnumAttrCaseBit<"HeightSharded", 8, "height_sharded">;
def TT_OperandConstraintWidthSharded : I32BitEnumAttrCaseBit<"WidthSharded", 9, "width_sharded">;
def TT_OperandConstraintBlockSharded : I32BitEnumAttrCaseBit<"BlockSharded", 10, "block_sharded">;
def TT_OperandConstraintAnyLayout : I32BitEnumAttrCaseGroup<"AnyLayout", [TT_OperandConstraintNoneLayout, TT_OperandConstraintInterleaved, TT_OperandConstraintSingleBank, TT_OperandConstraintHeightSharded, TT_OperandConstraintWidthSharded, TT_OperandConstraintBlockSharded], "any_layout">;
def TT_OperandConstraintAny : I32BitEnumAttrCaseGroup<"Any", [TT_OperandConstraintSystem, TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any">;
def TT_OperandConstraintAnyDevice : I32BitEnumAttrCaseGroup<"AnyDevice", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device">;
def TT_OperandConstraintAnyDeviceTile : I32BitEnumAttrCaseGroup<"AnyDeviceTile", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device_tile">;
def TT_OperandConstraintL1BlockSharded : I32BitEnumAttrCaseGroup<"L1BlockSharded", [TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintBlockSharded], "l1_block_sharded">;
def TT_OperandConstraint : I32BitEnumAttr<"OperandConstraint", "TT Operand Constraints",
[
TT_OperandConstraintSystem,
TT_OperandConstraintDRAM,
TT_OperandConstraintL1,
TT_OperandConstraintScalar,
TT_OperandConstraintTile,
TT_OperandConstraintNoneLayout,
TT_OperandConstraintInterleaved,
TT_OperandConstraintSingleBank,
TT_OperandConstraintHeightSharded,
TT_OperandConstraintWidthSharded,
TT_OperandConstraintBlockSharded,
TT_OperandConstraintAnyLayout,
TT_OperandConstraintAny,
TT_OperandConstraintAnyDevice,
TT_OperandConstraintAnyDeviceTile,
TT_OperandConstraintL1BlockSharded,
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::tt";
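In this bit-enum, each operand-constraint case occupies one bit and the named groups (`AnyLayout`, `Any`, `L1BlockSharded`, ...) are simply the bitwise OR of their members. A standalone Python sketch of the same semantics (an illustrative mirror of the tablegen bit positions, not the generated C++):

```python
from enum import IntFlag

# Illustrative mirror of the TT_OperandConstraint bit positions above;
# not the actual generated enum.
class OperandConstraint(IntFlag):
    SYSTEM = 1 << 0
    DRAM = 1 << 1
    L1 = 1 << 2
    SCALAR = 1 << 3
    TILE = 1 << 4
    NONE_LAYOUT = 1 << 5
    INTERLEAVED = 1 << 6
    SINGLE_BANK = 1 << 7
    HEIGHT_SHARDED = 1 << 8
    WIDTH_SHARDED = 1 << 9
    BLOCK_SHARDED = 1 << 10

# Group cases are the OR of their member bits, as in the tablegen defs.
ANY_LAYOUT = (OperandConstraint.NONE_LAYOUT | OperandConstraint.INTERLEAVED
              | OperandConstraint.SINGLE_BANK
              | OperandConstraint.HEIGHT_SHARDED
              | OperandConstraint.WIDTH_SHARDED
              | OperandConstraint.BLOCK_SHARDED)
L1_BLOCK_SHARDED = (OperandConstraint.L1 | OperandConstraint.SCALAR
                    | OperandConstraint.TILE
                    | OperandConstraint.BLOCK_SHARDED)

# L1BlockSharded permits L1 placement with a block-sharded layout,
# but excludes DRAM and the other sharding variants.
assert OperandConstraint.L1 in L1_BLOCK_SHARDED
assert OperandConstraint.BLOCK_SHARDED in L1_BLOCK_SHARDED
assert OperandConstraint.DRAM not in L1_BLOCK_SHARDED
assert OperandConstraint.HEIGHT_SHARDED not in L1_BLOCK_SHARDED
```

Checking whether an operand satisfies a constraint then reduces to a bitwise AND against the group mask.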
10 changes: 10 additions & 0 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -22,6 +22,16 @@ inline bool isDeviceMemorySpace(MemorySpace memorySpace) {
memorySpace == MemorySpace::DeviceL1;
}

inline bool isL1MemorySpace(MemorySpace memorySpace) {
return memorySpace == MemorySpace::DeviceL1;
}

inline bool isShardedMemoryLayout(TensorMemoryLayout layout) {
return layout == TensorMemoryLayout::HeightSharded ||
layout == TensorMemoryLayout::WidthSharded ||
layout == TensorMemoryLayout::BlockSharded;
}

inline void printDimensionList(::mlir::AsmPrinter &printer,
::llvm::ArrayRef<int64_t> shape) {
printer.printDimensionList(shape);
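The new `isShardedMemoryLayout` helper is a simple predicate over the `TensorMemoryLayout` cases. A Python sketch of the same check (the enum mirrors the tablegen cases for illustration, not the generated C++):

```python
from enum import Enum

# Illustrative mirror of the TT_TensorMemoryLayout cases.
class TensorMemoryLayout(Enum):
    NONE_LAYOUT = 0
    INTERLEAVED = 1
    SINGLE_BANK = 2
    HEIGHT_SHARDED = 3
    WIDTH_SHARDED = 4
    BLOCK_SHARDED = 5

def is_sharded_memory_layout(layout):
    # Only the three sharded variants qualify, matching the C++ helper.
    return layout in (TensorMemoryLayout.HEIGHT_SHARDED,
                      TensorMemoryLayout.WIDTH_SHARDED,
                      TensorMemoryLayout.BLOCK_SHARDED)

assert is_sharded_memory_layout(TensorMemoryLayout.BLOCK_SHARDED)
assert not is_sharded_memory_layout(TensorMemoryLayout.INTERLEAVED)
```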
25 changes: 18 additions & 7 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -107,6 +107,7 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> {
"unsigned":$l1UnreservedBase,
"unsigned":$eriscL1UnreservedBase,
"unsigned":$dramUnreservedBase,
"unsigned":$dramUnreservedEnd,
"ChipPhysicalCoresAttr":$chipPhysicalCores,
ArrayRefParameter<"DataTypeAttr">:$supportedDataTypes,
ArrayRefParameter<"TileSizeAttr">:$supportedTileSizes);
@@ -121,13 +122,14 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> {
`l1_unreserved_base` `=` $l1UnreservedBase `,`
`erisc_l1_unreserved_base` `=` $eriscL1UnreservedBase `,`
`dram_unreserved_base` `=` $dramUnreservedBase `,`
`dram_unreserved_end` `=` $dramUnreservedEnd `,`
`physical_cores` `=` $chipPhysicalCores `,`
`supported_data_types` `=` `[` $supportedDataTypes `]` `,`
`supported_tile_sizes` `=` `[` $supportedTileSizes `]` `}`}];

let extraClassDeclaration = [{
unsigned getUsableL1Size() const { return getL1Size() - getL1UnreservedBase(); }
unsigned getUsableDramChannelSize() const { return getDramChannelSize() - getDramUnreservedBase(); }
unsigned getUsableDramChannelSize() const { return getDramUnreservedEnd() - getDramUnreservedBase(); }
}];
}
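This diff replaces `getDramChannelSize() - getDramUnreservedBase()` with `getDramUnreservedEnd() - getDramUnreservedBase()`, so the usable window is bounded on both ends rather than assuming everything above the base is free. A small numeric sketch (the byte values are made up for illustration):

```python
# Hypothetical chip-descriptor values, for illustration only.
dram_channel_size = 1 << 30        # 1 GiB channel
dram_unreserved_base = 32 << 20    # first 32 MiB reserved
dram_unreserved_end = 1008 << 20   # last 16 MiB reserved at the top

# Old formula: ignored any reservation at the top of the channel.
old_usable = dram_channel_size - dram_unreserved_base
# New formula: usable window is [unreserved_base, unreserved_end).
new_usable = dram_unreserved_end - dram_unreserved_base

assert new_usable == 976 << 20
assert new_usable < old_usable
```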

@@ -243,8 +245,9 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
let parameters = (ins AttrParameter<"AffineMap", "An affine map that defines how the logical tensor dimensions map to a grid shape.">:$linear,
AttrParameter<"OOBVal", "A tracked out of bounds value that fills padding space.">:$oob_val,
AttrParameter<"GridAttr", "The grid shape that this tensor is divided onto.">:$grid,
AttrParameter<"MemRefType", "A memref that describes the physical footprint allocation of the shard. It must also have a shape with rank equal to grid.">:$memref);
let assemblyFormat = "`<` $linear`,` $oob_val`,` $grid`,` $memref `>`";
AttrParameter<"MemRefType", "A memref that describes the physical footprint allocation of the shard. It must also have a shape with rank equal to grid.">:$memref,
AttrParameter<"TensorMemoryLayout", "The layout of the tensor in memory.">:$mem_layout);
let assemblyFormat = "`<` $linear`,` $oob_val`,` $grid`,` $memref`,` $mem_layout `>`";

let extraClassDeclaration = [{
static LayoutAttr get(::mlir::MLIRContext *context,
@@ -253,29 +256,33 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
MemorySpace memorySpace = MemorySpace::System,
GridAttr grid = {},
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}},
OOBVal oobVal = OOBVal::Undef);
OOBVal oobVal = OOBVal::Undef,
TensorMemoryLayout memLayout = TensorMemoryLayout::NoneLayout);
static LayoutAttr get(::mlir::MLIRContext *context,
RankedTensorType ty,
MemorySpace memorySpace = MemorySpace::System,
GridAttr grid = {},
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}},
OOBVal oobVal = OOBVal::Undef);
OOBVal oobVal = OOBVal::Undef,
TensorMemoryLayout memLayout = TensorMemoryLayout::NoneLayout);
static LayoutAttr get(::mlir::MLIRContext *context,
RankedTensorType ty,
MemorySpace memorySpace,
GridAttr grid,
Type elementType);
Type elementType,
TensorMemoryLayout memLayout);
LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef<int64_t> tensorShape, GridAttr grid, ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withGrid(::mlir::MLIRContext *context,
RankedTensorType ty,
GridAttr grid,
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType);
LayoutAttr withMemorySpace(::mlir::MLIRContext *context, MemorySpace memorySpace);

LayoutAttr withMemoryLayout(::mlir::MLIRContext *context, TensorMemoryLayout memLayout);
MemorySpace getMemorySpace() const;
bool isSystemMemorySpace() const { return ::mlir::tt::isSystemMemorySpace(getMemorySpace()); }
bool isDeviceMemorySpace() const { return ::mlir::tt::isDeviceMemorySpace(getMemorySpace()); }
bool hasShardedTensorMemoryLayout() const;
bool isTiled() const;
Type getElementType() const;
Type getScalarElementType() const;
@@ -335,6 +342,10 @@ def TT_MemorySpaceAttr : EnumAttr<TT_Dialect, TT_MemorySpace, "memory_space"> {
let assemblyFormat = "`<` $value `>`";
}

def TT_TensorMemoryLayoutAttr : EnumAttr<TT_Dialect, TT_TensorMemoryLayout, "tensor_memory_layout"> {
let assemblyFormat = "`<` $value `>`";
}

def TT_OOBValAttr : EnumAttr<TT_Dialect, TT_OOBVal, "oob_val"> {
let assemblyFormat = "`<` $value `>`";
}
