Skip to content

Commit

Permalink
Merge branch 'main' into vwells/llvm_helper_transform
Browse files Browse the repository at this point in the history
  • Loading branch information
vwellsTT committed Dec 17, 2024
2 parents f7147fb + 8c37b9d commit 6348c92
Show file tree
Hide file tree
Showing 102 changed files with 939 additions and 507 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@
/test/ttmlir/Dialect/TTNN/optimizer/ @nobradovictt @odjuricicTT
/test/ttmlir/Silicon/TTNN/optimizer/ @nobradovictt @odjuricicTT
/test/unittests/Optimizer @nobradovictt @odjuricicTT
/tools/explorer/ @odjuricicTT @nobradovictt @vprajapati-tt
/tools/ @svuckovicTT @mtopalovicTT
/tools/explorer/ @odjuricicTT @nobradovictt @vprajapati-tt
7 changes: 1 addition & 6 deletions docs/src/adding-an-op.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,13 @@ There are many things to break down here, starting from the top:
be critical for modeling buffer allocation / lifetimes. Note the 3rd argument
`AnyRankedTensor:$output`.
- Next we have a list of `arguments`. These arguments consist of a mixture of
`Type`s (i.e. `AnyRankedTensor`) and `Attribute`s (i.e. `TT_OperandConstraintArrayAttr`).
`Type`s (i.e. `AnyRankedTensor`) and `Attribute`s.
[Read more about Types & Attributes
here](https://mlir.llvm.org/docs/DefiningDialects/AttributesAndTypes/#attributes).
- `AnyRankedTensor` is part of a tablegen standard library which type
aliases to MLIR's builtin Tensor type, with the added constraint that the
tensor has a static rank. As much as possible we want to use the builtin
types and infrastructure provided by MLIR.
- `TT_OperandConstraintArrayAttr` is a custom attribute that we have defined
in the [`TT`](./autogen/md/Dialect/TTDialect.md) dialect. This attribute is
used to specify constraints on the
operands of the operation. For example, the `TTIR_MatmulOp` requires that
the input tensors be in tile layout, this attribute captures this constraint.
- Next we have a list of `results` in this case just 1, which aliases the
`output` tensor. One drawback of DPS is that the result tensor and the
output tensor will appear to have different SSA names in the IR, but they
Expand Down
3 changes: 1 addition & 2 deletions docs/src/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,7 @@ module attributes {tt.system_desc = #tt.system_desc<[<#tt.arch<wormhole_b0>, #tt
defines the type of result

- Quotes are added around ttir.multiply since it's part of a
custom dialect, and more custom assembly instructions are
applied to specify operand_constraints.
custom dialect.

- Operations typically have operands (arguments) and results which
are highlighted with %, these results and operands help to show
Expand Down
7 changes: 0 additions & 7 deletions include/ttmlir-c/TTAttrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,6 @@ ttmlirTTIteratorTypeAttrGet(MlirContext ctx, uint32_t iteratorType);
MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTIteratorTypeArrayAttrGet(
MlirContext ctx, uint32_t *iteratorTypes, size_t iteratorTypesSize);

MLIR_CAPI_EXPORTED MlirAttribute
ttmlirTTOperandConstraintAttrGet(MlirContext ctx, uint32_t OperandConstraint);

MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOperandConstraintArrayAttrGet(
MlirContext ctx, uint32_t *OperandConstraints,
size_t OperandConstraintsSize);

MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTTileSizeAttrGet(MlirContext ctx,
int64_t y, int64_t x);

Expand Down
1 change: 1 addition & 0 deletions include/ttmlir/Bindings/Python/TTMLIRModule.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ void populateTTNNModule(py::module &m);
void populateOverridesModule(py::module &m);
void populateOptimizerOverridesModule(py::module &m);
void populatePassesModule(py::module &m);
void populateUtilModule(py::module &m);
} // namespace mlir::ttmlir::python

#endif // TTMLIR_BINDINGS_PYTHON_TTMLIRMODULE_H
1 change: 1 addition & 0 deletions include/ttmlir/Dialect/TT/IR/TTOps.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
Expand Down
25 changes: 25 additions & 0 deletions include/ttmlir/Dialect/TT/IR/TTOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,30 @@
#define TTMLIR_TTMLIR_TTOPS_TD

include "ttmlir/Dialect/TT/IR/TTOpsTypes.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/CommonTypeConstraints.td"

// Tuple element extraction op for the TT dialect. `Pure` (no side effects)
// and result type is computed via InferTypeOpInterface, whose methods are
// declared here and implemented in the dialect's C++ sources.
def TT_GetTupleElementOp: TT_Op<"get_tuple_element", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
  let summary = "GetTupleElement operation";
  let description = [{
    Extracts element at `index` position of the `operand` tuple and produces a `result`.

    Example:
    ```mlir
    %result = tt.get_tuple_element %operand[0] : (tuple<tensor<32x32xbf16>, tuple<tensor<1x32xf32>>>) -> tensor<32x32xbf16>
    ```
  }];

  // `operand` is a (possibly nested) tuple of ranked tensors; `index` is a
  // static i32 attribute constrained to be non-negative.
  let arguments = (ins TT_Tuple:$operand,
                       ConfinedAttr<I32Attr, [IntNonNegative]>:$index
  );

  let results = (outs TT_TupleReturnType:$result);

  // Custom assembly: `%operand[index] : functional-type`, matching the
  // example in the description above.
  let assemblyFormat = [{
    $operand `[` $index `]` attr-dict `:` functional-type(operands, results)
  }];
}

#endif
41 changes: 0 additions & 41 deletions include/ttmlir/Dialect/TT/IR/TTOpsEnums.td
Original file line number Diff line number Diff line change
Expand Up @@ -126,47 +126,6 @@ def TT_OOBVal : I32EnumAttr<"OOBVal", "TT OOBVal",
let cppNamespace = "::mlir::tt";
}

def TT_OperandConstraintSystem : I32BitEnumAttrCaseBit<"System", 0, "system">;
def TT_OperandConstraintDRAM : I32BitEnumAttrCaseBit<"DRAM", 1, "dram">;
def TT_OperandConstraintL1 : I32BitEnumAttrCaseBit<"L1", 2, "l1">;
def TT_OperandConstraintScalar : I32BitEnumAttrCaseBit<"Scalar", 3, "scalar">;
def TT_OperandConstraintTile : I32BitEnumAttrCaseBit<"Tile", 4, "tile">;
def TT_OperandConstraintNone : I32BitEnumAttrCaseBit<"None", 5, "none">;
def TT_OperandConstraintInterleaved : I32BitEnumAttrCaseBit<"Interleaved", 6, "interleaved">;
def TT_OperandConstraintSingleBank : I32BitEnumAttrCaseBit<"SingleBank", 7, "single_bank">;
def TT_OperandConstraintHeightSharded : I32BitEnumAttrCaseBit<"HeightSharded", 8, "height_sharded">;
def TT_OperandConstraintWidthSharded : I32BitEnumAttrCaseBit<"WidthSharded", 9, "width_sharded">;
def TT_OperandConstraintBlockSharded : I32BitEnumAttrCaseBit<"BlockSharded", 10, "block_sharded">;
def TT_OperandConstraintSystemScalar : I32BitEnumAttrCaseGroup<"SystemScalar", [TT_OperandConstraintSystem, TT_OperandConstraintScalar], "system_scalar">;
def TT_OperandConstraintAnyLayout : I32BitEnumAttrCaseGroup<"AnyLayout", [TT_OperandConstraintNone, TT_OperandConstraintInterleaved, TT_OperandConstraintSingleBank, TT_OperandConstraintHeightSharded, TT_OperandConstraintWidthSharded, TT_OperandConstraintBlockSharded], "any_layout">;
def TT_OperandConstraintAny : I32BitEnumAttrCaseGroup<"Any", [TT_OperandConstraintSystem, TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any">;
def TT_OperandConstraintAnyDevice : I32BitEnumAttrCaseGroup<"AnyDevice", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device">;
def TT_OperandConstraintAnyDeviceTile : I32BitEnumAttrCaseGroup<"AnyDeviceTile", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device_tile">;
def TT_OperandConstraintL1BlockSharded : I32BitEnumAttrCaseGroup<"L1BlockSharded", [TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintBlockSharded], "l1_block_sharded">;
def TT_OperandConstraint : I32BitEnumAttr<"OperandConstraint", "TT Operand Constraints",
[
TT_OperandConstraintSystem,
TT_OperandConstraintDRAM,
TT_OperandConstraintL1,
TT_OperandConstraintScalar,
TT_OperandConstraintTile,
TT_OperandConstraintNone,
TT_OperandConstraintInterleaved,
TT_OperandConstraintSingleBank,
TT_OperandConstraintHeightSharded,
TT_OperandConstraintWidthSharded,
TT_OperandConstraintBlockSharded,
TT_OperandConstraintSystemScalar,
TT_OperandConstraintAnyLayout,
TT_OperandConstraintAny,
TT_OperandConstraintAnyDevice,
TT_OperandConstraintAnyDeviceTile,
TT_OperandConstraintL1BlockSharded,
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::tt";
}

def TT_ChipCapabilityPCIE : I32BitEnumAttrCaseBit<"PCIE", 0, "pcie">;
def TT_ChipCapabilityHostMMIO : I32BitEnumAttrCaseBit<"HostMMIO", 1, "host_mmio">;

Expand Down
14 changes: 8 additions & 6 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
Original file line number Diff line number Diff line change
Expand Up @@ -428,12 +428,6 @@ def TT_IteratorTypeAttr : EnumAttr<TT_Dialect, TT_IteratorType, "iterator_type">

def TT_IteratorTypeArrayAttr : TypedArrayAttrBase<TT_IteratorTypeAttr, "">;

def TT_OperandConstraintAttr : EnumAttr<TT_Dialect, TT_OperandConstraint, "operand_constraint"> {
let assemblyFormat = "`<` $value `>`";
}

def TT_OperandConstraintArrayAttr : TypedArrayAttrBase<TT_OperandConstraintAttr, "">;

def TT_ArgumentAllocationAttr : TT_Attr<"ArgumentAllocation", "arg_alloc", []> {
let summary = "Argument allocation attribute in TT dialect";
let description = [{
Expand Down Expand Up @@ -494,4 +488,12 @@ def TT_Device : TT_Type<"Device", "device", []> {
let assemblyFormat = "`<` $desc `>`";
}

//===----------------------------------------------------------------------===//
// Auxiliary type definitions
//===----------------------------------------------------------------------===//

def TT_Tuple : NestedTupleOf<[AnyRankedTensor]>;

def TT_TupleReturnType : AnyTypeOf<[AnyRankedTensor]>;

#endif
95 changes: 0 additions & 95 deletions include/ttmlir/Dialect/TT/Utils/OperandConstraints.h

This file was deleted.

6 changes: 0 additions & 6 deletions include/ttmlir/Dialect/TTIR/IR/TTIROps.td
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> {
TT_GridAttr:$grid,
AffineMapArrayAttr:$indexing_maps,
TT_IteratorTypeArrayAttr:$iterator_types,
TT_OperandConstraintArrayAttr:$operand_constraints,
DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$operand_cb_mapping); // index of input operand and index of cb go together
let results = (outs Variadic<AnyRankedTensor>:$results);
let regions = (region AnyRegion:$region);
Expand Down Expand Up @@ -126,11 +125,6 @@ def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface, TTIROpI

let extraClassDeclaration = [{
MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); }
ArrayAttr getOperandConstraints() {
return nullptr;
// TODO return below, but we need a way to properly create an ArrayAttr:
// return {OperandConstraint::Any, OperandConstraint::Any};
}

struct CompoundComponents {
bool isLayoutChange;
Expand Down
76 changes: 76 additions & 0 deletions include/ttmlir/Dialect/TTNN/Analysis/BFInterleavedPolicy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_BFINTERLEAVEDPOLICY_H
#define TTMLIR_DIALECT_TTNN_ANALYSIS_BFINTERLEAVEDPOLICY_H

#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
#include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h"
#include <cstdint>

namespace mlir::tt::ttnn {

// Best-fit interleaved memory-layout policy.
//
// The goal of this policy is to always solve simple fork-joins if that is
// possible. Fork-join is considered to be simple if there is no need for DRAM
// spill in its execution. Furthermore, if DRAM spill is necessary, this policy
// will not produce a globally optimal solution.
//
class BFInterleavedPolicy : public MemoryLayoutAnalysisPolicy {
public:
  // In order to keep track of the L1 memory usage, we have to know two things
  // for each op:
  //    1. The L1 memory usage of each op's output tensor.
  //    2. The number of op's users currently relying on the op's output tensor.
  //       This is important for fork ops where the output tensor is used by
  //       multiple other ops.
  //
  struct OpL1MemUsage {
    uint64_t l1MemUsagePerUser;       // L1 bytes held per unscheduled user.
    uint64_t numOfUnscheduledUsers;   // Users not yet scheduled; when this
                                      // drops to zero the output can be freed.
  };

public:
  // All state (root op, chain configs, legal layouts, schedule, L1 budget) is
  // owned by the MemoryLayoutAnalysisPolicy base; this ctor only forwards.
  BFInterleavedPolicy(
      Operation *rootOp, std::vector<L1ChainConfig> &l1ChainConfigs,
      const llvm::DenseMap<Operation *, std::vector<TTNNLayoutAttr>>
          &legalLayouts,
      llvm::DenseMap<func::FuncOp, llvm::SmallVector<Operation *>> &schedule,
      unsigned usableL1CacheSize)
      : MemoryLayoutAnalysisPolicy(rootOp, l1ChainConfigs, legalLayouts,
                                   schedule, usableL1CacheSize) {}

  // Entry point; runs the analysis. Declared by the base policy interface
  // and implemented in the corresponding .cpp.
  void run() final;

private:
  // Check if the op is analyzable. Op is analyzable if it has at least one
  // legal layout.
  bool isAnalyzable(Operation *op);

  // Iterate over all operands of the op that satisfy the analyzability
  // criterion defined by the isAnalyzable method. This is an abstraction
  // for the boilerplate code used in different places within the policy.
  //
  void walkOnAnalyzableOperands(Operation *op,
                                function_ref<void(Operation *)> callback);

  // Fetch op's DRAM layout from legalLayouts.
  bool hasDRAMBufferType(Operation *op);
  TTNNLayoutAttr getDRAMLayout(Operation *op);

  // Fetch op's L1 Interleaved layout from legalLayouts.
  bool hasL1BufferType(Operation *op);
  TTNNLayoutAttr getL1InterleavedLayout(Operation *op);

  // Returns the portion of usable L1 the policy may fill with tensors.
  // NOTE(review): float multiply implicitly truncates to size_t on return;
  // intentional here, but worth an explicit cast if this ever changes.
  size_t getAvailableL1CacheSize() const {
    // Figure out this const based on exec data, but will be replaced
    // with API.
    //
    constexpr float tensorL1UsageCap = 0.75;
    return tensorL1UsageCap * usableL1CacheSize;
  }
};

} // namespace mlir::tt::ttnn

#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_BFINTERLEAVEDPOLICY_H
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
//
// SPDX-License-Identifier: Apache-2.0

#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H
#define TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H
#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_GREEDYL1INTERLEAVEDPOLICY_H
#define TTMLIR_DIALECT_TTNN_ANALYSIS_GREEDYL1INTERLEAVEDPOLICY_H

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h"
Expand All @@ -12,7 +12,7 @@

namespace mlir::tt::ttnn {

class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy {
class GreedyL1InterleavedPolicy : public MemoryLayoutAnalysisPolicy {
public:
struct OpMemSpec {
TTNNLayoutAttr layout;
Expand Down Expand Up @@ -46,7 +46,7 @@ class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy {
};

public:
L1InterleavedPolicy(
GreedyL1InterleavedPolicy(
Operation *rootOp, std::vector<L1ChainConfig> &l1ChainConfigs,
const llvm::DenseMap<Operation *, std::vector<TTNNLayoutAttr>>
&legalLayouts,
Expand Down Expand Up @@ -124,4 +124,4 @@ class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy {

} // namespace mlir::tt::ttnn

#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H
#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_GREEDYL1INTERLEAVEDPOLICY_H
Loading

0 comments on commit 6348c92

Please sign in to comment.