diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f49354540e..8e23597a4e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -28,5 +28,5 @@ /test/ttmlir/Dialect/TTNN/optimizer/ @nobradovictt @odjuricicTT /test/ttmlir/Silicon/TTNN/optimizer/ @nobradovictt @odjuricicTT /test/unittests/Optimizer @nobradovictt @odjuricicTT -/tools/explorer/ @odjuricicTT @nobradovictt @vprajapati-tt /tools/ @svuckovicTT @mtopalovicTT +/tools/explorer/ @odjuricicTT @nobradovictt @vprajapati-tt diff --git a/docs/src/adding-an-op.md b/docs/src/adding-an-op.md index 74c713f0c7..cbe4a44c10 100644 --- a/docs/src/adding-an-op.md +++ b/docs/src/adding-an-op.md @@ -53,18 +53,13 @@ There are many things to break down here, starting from the top: be critical for modeling buffer allocation / lifetimes. Note the 3rd argument `AnyRankedTensor:$output`. - Next we have a list of `arguments`. These arguments consist of a mixture of - `Type`s (i.e. `AnyRankedTensor`) and `Attribute`s (i.e. `TT_OperandConstraintArrayAttr`). + `Type`s (i.e. `AnyRankedTensor`) and `Attribute`s. [Read more about Types & Attributes here](https://mlir.llvm.org/docs/DefiningDialects/AttributesAndTypes/#attributes). - `AnyRankedTensor` is part of a tablegen standard library which type aliases to MLIR's builtin Tensor type, with the added constraint that the tensor has a static rank. As much as possible we want to use the builtin types and infrastructure provided by MLIR. - - `TT_OperandConstraintArrayAttr` is a custom attribute that we have defined - in the [`TT`](./autogen/md/Dialect/TTDialect.md) dialect. This attribute is - used to specify constraints on the - operands of the operation. For example, the `TTIR_MatmulOp` requires that - the input tensors be in tile layout, this attribute captures this constraint. - Next we have a list of `results` in this case just 1, which aliases the `output` tensor. One drawback of DPS is that the result tensor and the output tensor will appear to have different SSA names in the IR, but they diff --git a/docs/src/overview.md b/docs/src/overview.md index f2e87fa032..b181b822a2 100644 --- a/docs/src/overview.md +++ b/docs/src/overview.md @@ -104,8 +104,7 @@ module attributes {tt.system_desc = #tt.system_desc<[<#tt.arch, #tt defines the type of result - Quotes are added around ttir.multiply since it's part of a - custom dialect, and more custom assembly instructions are - applied to specify operand_constraints. + custom dialect. 
- Operations typically have operands (arguments) and results which are highlighted with %, these results and operands help to show diff --git a/include/ttmlir-c/TTAttrs.h b/include/ttmlir-c/TTAttrs.h index 263cd1d8e4..6884eb3869 100644 --- a/include/ttmlir-c/TTAttrs.h +++ b/include/ttmlir-c/TTAttrs.h @@ -69,13 +69,6 @@ ttmlirTTIteratorTypeAttrGet(MlirContext ctx, uint32_t iteratorType); MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTIteratorTypeArrayAttrGet( MlirContext ctx, uint32_t *iteratorTypes, size_t iteratorTypesSize); -MLIR_CAPI_EXPORTED MlirAttribute -ttmlirTTOperandConstraintAttrGet(MlirContext ctx, uint32_t OperandConstraint); - -MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOperandConstraintArrayAttrGet( - MlirContext ctx, uint32_t *OperandConstraints, - size_t OperandConstraintsSize); - MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTTileSizeAttrGet(MlirContext ctx, int64_t y, int64_t x); diff --git a/include/ttmlir/Bindings/Python/TTMLIRModule.h b/include/ttmlir/Bindings/Python/TTMLIRModule.h index d36529e676..49ec0e912b 100644 --- a/include/ttmlir/Bindings/Python/TTMLIRModule.h +++ b/include/ttmlir/Bindings/Python/TTMLIRModule.h @@ -62,6 +62,7 @@ void populateTTNNModule(py::module &m); void populateOverridesModule(py::module &m); void populateOptimizerOverridesModule(py::module &m); void populatePassesModule(py::module &m); +void populateUtilModule(py::module &m); } // namespace mlir::ttmlir::python #endif // TTMLIR_BINDINGS_PYTHON_TTMLIRMODULE_H diff --git a/include/ttmlir/Dialect/TT/IR/TTOps.h b/include/ttmlir/Dialect/TT/IR/TTOps.h index 047fc2a3c4..16219c04e3 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOps.h +++ b/include/ttmlir/Dialect/TT/IR/TTOps.h @@ -9,6 +9,7 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/OpDefinition.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "mlir/Interfaces/InferTypeOpInterface.h" diff --git a/include/ttmlir/Dialect/TT/IR/TTOps.td b/include/ttmlir/Dialect/TT/IR/TTOps.td index d3b34fff88..9078028ab1 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOps.td +++ b/include/ttmlir/Dialect/TT/IR/TTOps.td @@ -6,5 +6,30 @@ #define TTMLIR_TTMLIR_TTOPS_TD include "ttmlir/Dialect/TT/IR/TTOpsTypes.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/CommonTypeConstraints.td" + +def TT_GetTupleElementOp: TT_Op<"get_tuple_element", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> { + let summary = "GetTupleElement operation"; + let description = [{ + Extracts element at `index` position of the `operand` tuple and produces a `result`.
+ + Example: + ```mlir + %result = tt.get_tuple_element %operand[0] : (tuple<tensor<32x32xbf16>, tuple<tensor<32x32xbf16>>>) -> tensor<32x32xbf16> + ``` + }]; + + let arguments = (ins TT_Tuple:$operand, + ConfinedAttr<I32Attr, [IntNonNegative]>:$index + ); + + let results = (outs TT_TupleReturnType:$result); + + let assemblyFormat = [{ + $operand `[` $index `]` attr-dict `:` functional-type(operands, results) + }]; +} #endif diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td b/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td index aee19f63c6..9ffa63dade 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsEnums.td @@ -126,47 +126,6 @@ def TT_OOBVal : I32EnumAttr<"OOBVal", "TT OOBVal", let cppNamespace = "::mlir::tt"; } -def TT_OperandConstraintSystem : I32BitEnumAttrCaseBit<"System", 0, "system">; -def TT_OperandConstraintDRAM : I32BitEnumAttrCaseBit<"DRAM", 1, "dram">; -def TT_OperandConstraintL1 : I32BitEnumAttrCaseBit<"L1", 2, "l1">; -def TT_OperandConstraintScalar : I32BitEnumAttrCaseBit<"Scalar", 3, "scalar">; -def TT_OperandConstraintTile : I32BitEnumAttrCaseBit<"Tile", 4, "tile">; -def TT_OperandConstraintNone : I32BitEnumAttrCaseBit<"None", 5, "none">; -def TT_OperandConstraintInterleaved : I32BitEnumAttrCaseBit<"Interleaved", 6, "interleaved">; -def TT_OperandConstraintSingleBank : I32BitEnumAttrCaseBit<"SingleBank", 7, "single_bank">; -def TT_OperandConstraintHeightSharded : I32BitEnumAttrCaseBit<"HeightSharded", 8, "height_sharded">; -def TT_OperandConstraintWidthSharded : I32BitEnumAttrCaseBit<"WidthSharded", 9, "width_sharded">; -def TT_OperandConstraintBlockSharded : I32BitEnumAttrCaseBit<"BlockSharded", 10, "block_sharded">; -def TT_OperandConstraintSystemScalar : I32BitEnumAttrCaseGroup<"SystemScalar", [TT_OperandConstraintSystem, TT_OperandConstraintScalar], "system_scalar">; -def TT_OperandConstraintAnyLayout : I32BitEnumAttrCaseGroup<"AnyLayout", [TT_OperandConstraintNone, TT_OperandConstraintInterleaved, TT_OperandConstraintSingleBank, TT_OperandConstraintHeightSharded, TT_OperandConstraintWidthSharded, TT_OperandConstraintBlockSharded], "any_layout">; -def TT_OperandConstraintAny : I32BitEnumAttrCaseGroup<"Any", [TT_OperandConstraintSystem, TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any">; -def TT_OperandConstraintAnyDevice : I32BitEnumAttrCaseGroup<"AnyDevice", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device">; -def TT_OperandConstraintAnyDeviceTile : I32BitEnumAttrCaseGroup<"AnyDeviceTile", [TT_OperandConstraintDRAM, TT_OperandConstraintL1, TT_OperandConstraintTile, TT_OperandConstraintAnyLayout], "any_device_tile">; -def TT_OperandConstraintL1BlockSharded : I32BitEnumAttrCaseGroup<"L1BlockSharded", [TT_OperandConstraintL1, TT_OperandConstraintScalar, TT_OperandConstraintTile, TT_OperandConstraintBlockSharded], "l1_block_sharded">; -def TT_OperandConstraint : I32BitEnumAttr<"OperandConstraint", "TT Operand Constraints", - [ - TT_OperandConstraintSystem, - TT_OperandConstraintDRAM, - TT_OperandConstraintL1, - TT_OperandConstraintScalar, - TT_OperandConstraintTile, - TT_OperandConstraintNone, - TT_OperandConstraintInterleaved, - TT_OperandConstraintSingleBank, - TT_OperandConstraintHeightSharded, - TT_OperandConstraintWidthSharded, - TT_OperandConstraintBlockSharded, - TT_OperandConstraintSystemScalar, - TT_OperandConstraintAnyLayout, - TT_OperandConstraintAny, - TT_OperandConstraintAnyDevice, -
TT_OperandConstraintAnyDeviceTile, - TT_OperandConstraintL1BlockSharded, - ]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::tt"; -} - def TT_ChipCapabilityPCIE : I32BitEnumAttrCaseBit<"PCIE", 0, "pcie">; def TT_ChipCapabilityHostMMIO : I32BitEnumAttrCaseBit<"HostMMIO", 1, "host_mmio">; diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td index d5dc22e28d..7472c298b7 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td @@ -428,12 +428,6 @@ def TT_IteratorTypeAttr : EnumAttr def TT_IteratorTypeArrayAttr : TypedArrayAttrBase; -def TT_OperandConstraintAttr : EnumAttr { - let assemblyFormat = "`<` $value `>`"; -} - -def TT_OperandConstraintArrayAttr : TypedArrayAttrBase; - def TT_ArgumentAllocationAttr : TT_Attr<"ArgumentAllocation", "arg_alloc", []> { let summary = "Argument allocation attribute in TT dialect"; let description = [{ @@ -494,4 +488,12 @@ def TT_Device : TT_Type<"Device", "device", []> { let assemblyFormat = "`<` $desc `>`"; } +//===----------------------------------------------------------------------===// +// Auxiliary type definitions +//===----------------------------------------------------------------------===// + +def TT_Tuple : NestedTupleOf<[AnyRankedTensor]>; + +def TT_TupleReturnType : AnyTypeOf<[AnyRankedTensor]>; + #endif diff --git a/include/ttmlir/Dialect/TT/Utils/OperandConstraints.h b/include/ttmlir/Dialect/TT/Utils/OperandConstraints.h deleted file mode 100644 index b6147102d9..0000000000 --- a/include/ttmlir/Dialect/TT/Utils/OperandConstraints.h +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef TTMLIR_DIALECT_TT_UTILS_OPERANDCONSTRAINTS_H -#define TTMLIR_DIALECT_TT_UTILS_OPERANDCONSTRAINTS_H - -#include "ttmlir/Dialect/TT/IR/TT.h" -#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" - -namespace mlir::tt { - -inline OperandConstraint -memorySpaceAsOperandConstraint(MemorySpace memorySpace) { - switch (memorySpace) { - case MemorySpace::System: - case MemorySpace::SystemMMIO: - return OperandConstraint::System; - case MemorySpace::DeviceDRAM: - return OperandConstraint::DRAM; - case MemorySpace::DeviceL1: - return OperandConstraint::L1; - } -} - -inline OperandConstraint -memoryLayoutAsOperandConstraint(TensorMemoryLayout memoryLayout) { - switch (memoryLayout) { - case TensorMemoryLayout::None: - return OperandConstraint::None; - case TensorMemoryLayout::Interleaved: - return OperandConstraint::Interleaved; - case TensorMemoryLayout::SingleBank: - return OperandConstraint::SingleBank; - case TensorMemoryLayout::HeightSharded: - return OperandConstraint::HeightSharded; - case TensorMemoryLayout::WidthSharded: - return OperandConstraint::WidthSharded; - case TensorMemoryLayout::BlockSharded: - return OperandConstraint::BlockSharded; - } -} - -inline MemorySpace getLegalMemorySpace(OperandConstraint operandConstraint, - MemorySpace defaultMemorySpace) { - if (bitEnumContainsAny(operandConstraint, - memorySpaceAsOperandConstraint(defaultMemorySpace))) { - return defaultMemorySpace; - } - if (bitEnumContainsAny(operandConstraint, OperandConstraint::DRAM)) { - return MemorySpace::DeviceDRAM; - } - if (bitEnumContainsAny(operandConstraint, OperandConstraint::L1)) { - return MemorySpace::DeviceL1; - } - return MemorySpace::System; -} - -inline TensorMemoryLayout -getLegalTensorMemoryLayout(OperandConstraint operandConstraint, - MemorySpace targetMemorySpace, - 
TensorMemoryLayout defaultDeviceMemLayout) { - if (defaultDeviceMemLayout == TensorMemoryLayout::None) { - return TensorMemoryLayout::None; - } - - if (isSystemMemorySpace(targetMemorySpace)) { - return TensorMemoryLayout::None; - } - - assert(isDeviceMemorySpace(targetMemorySpace)); - if (bitEnumContainsAny(operandConstraint, memoryLayoutAsOperandConstraint( - defaultDeviceMemLayout))) { - return defaultDeviceMemLayout; - } - - std::map validLayoutsMap = { - {OperandConstraint::Interleaved, TensorMemoryLayout::Interleaved}, - {OperandConstraint::SingleBank, TensorMemoryLayout::SingleBank}, - {OperandConstraint::HeightSharded, TensorMemoryLayout::HeightSharded}, - {OperandConstraint::WidthSharded, TensorMemoryLayout::WidthSharded}, - {OperandConstraint::BlockSharded, TensorMemoryLayout::BlockSharded}}; - - for (const auto &[constraintLayout, memLayout] : validLayoutsMap) { - if (bitEnumContainsAny(operandConstraint, constraintLayout)) { - return memLayout; - } - } - - return TensorMemoryLayout::None; -} - -} // namespace mlir::tt - -#endif // TTMLIR_DIALECT_TT_UTILS_OPERANDCONSTRAINTS_H diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index ff1cc61bed..e685172c24 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -62,7 +62,6 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> { TT_GridAttr:$grid, AffineMapArrayAttr:$indexing_maps, TT_IteratorTypeArrayAttr:$iterator_types, - TT_OperandConstraintArrayAttr:$operand_constraints, DefaultValuedOptionalAttr:$operand_cb_mapping); // index of input operand and index of cb go together let results = (outs Variadic:$results); let regions = (region AnyRegion:$region); @@ -126,11 +125,6 @@ def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface, TTIROpI let extraClassDeclaration = [{ MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } - ArrayAttr getOperandConstraints() { - return nullptr; - // TODO return below, but we need a way to properly create an ArrayAttr: - // return {OperandConstraint::Any, OperandConstraint::Any}; - } struct CompoundComponents { bool isLayoutChange; diff --git a/include/ttmlir/Dialect/TTNN/Analysis/BFInterleavedPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/BFInterleavedPolicy.h new file mode 100644 index 0000000000..1744a1d415 --- /dev/null +++ b/include/ttmlir/Dialect/TTNN/Analysis/BFInterleavedPolicy.h @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_BFINTERLEAVEDPOLICY_H +#define TTMLIR_DIALECT_TTNN_ANALYSIS_BFINTERLEAVEDPOLICY_H + +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" +#include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h" +#include + +namespace mlir::tt::ttnn { + +// The goal of this policy is to always solve simple fork-joins if that is +// possible. Fork-join is considered to be simple if there is no need for DRAM +// spill in its execution. Furthermore, if DRAM spill is necessary, this policy +// will not produce globally optimal solution. +// +class BFInterleavedPolicy : public MemoryLayoutAnalysisPolicy { +public: + // In order to keep track of the L1 memory usage, we have to know two things + // for each op: + // 1. The L1 memory usage of each op's output tensor. + // 2. The number of op's users currently relying on the op's output tensor. 
+ // This is important for fork ops where the output tensor is used by + // multiple other ops. + // + struct OpL1MemUsage { + uint64_t l1MemUsagePerUser; + uint64_t numOfUnscheduledUsers; + }; + +public: + BFInterleavedPolicy( + Operation *rootOp, std::vector &l1ChainConfigs, + const llvm::DenseMap> + &legalLayouts, + llvm::DenseMap> &schedule, + unsigned usableL1CacheSize) + : MemoryLayoutAnalysisPolicy(rootOp, l1ChainConfigs, legalLayouts, + schedule, usableL1CacheSize) {} + + void run() final; + +private: + // Check if the op is analyzable. Op is analyzable if it has at least one + // legal layout. + bool isAnalyzable(Operation *op); + + // Iterate over all operands of the op that satisfy the analyzability + // criterium defined by the isAnalyzable method. This is an abstraction + // for the boilerplate code used in different places within the policy. + // + void walkOnAnalyzableOperands(Operation *op, + function_ref callback); + + // Fetch op's DRAM layout from legalLayouts. + bool hasDRAMBufferType(Operation *op); + TTNNLayoutAttr getDRAMLayout(Operation *op); + + // Fetch op's L1 Interleaved layout from legalLayouts. + bool hasL1BufferType(Operation *op); + TTNNLayoutAttr getL1InterleavedLayout(Operation *op); + + size_t getAvailableL1CacheSize() const { + // Figure out this const based on exec data, but will be replaced + // with API. + // + constexpr float tensorL1UsageCap = 0.75; + return tensorL1UsageCap * usableL1CacheSize; + } +}; + +} // namespace mlir::tt::ttnn + +#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_BFINTERLEAVEDPOLICY_H diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.h similarity index 93% rename from include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h rename to include/ttmlir/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.h index 2392cd7c9c..b7325332e9 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H -#define TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H +#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_GREEDYL1INTERLEAVEDPOLICY_H +#define TTMLIR_DIALECT_TTNN_ANALYSIS_GREEDYL1INTERLEAVEDPOLICY_H #include "mlir/Dialect/Func/IR/FuncOps.h" #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" @@ -12,7 +12,7 @@ namespace mlir::tt::ttnn { -class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { +class GreedyL1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { public: struct OpMemSpec { TTNNLayoutAttr layout; @@ -46,7 +46,7 @@ class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { }; public: - L1InterleavedPolicy( + GreedyL1InterleavedPolicy( Operation *rootOp, std::vector &l1ChainConfigs, const llvm::DenseMap> &legalLayouts, @@ -124,4 +124,4 @@ class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { } // namespace mlir::tt::ttnn -#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H +#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_GREEDYL1INTERLEAVEDPOLICY_H diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h b/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h index b8aee2e4ea..32ab07a958 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h @@ -5,9 +5,7 @@ #ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_L1CHAINCONFIG_H #define TTMLIR_DIALECT_TTNN_ANALYSIS_L1CHAINCONFIG_H 
-#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" #include "ttmlir/Dialect/TTNN/Analysis/ShardSolver.h" -#include namespace mlir::tt::ttnn { @@ -19,7 +17,7 @@ struct OpL1MemSpec { // Tensor split factor for the output tensor of the op(working with a partial // tensor). // - uint tensorSplitFactor; + uint tensorSplitFactor = 1; // Layout of the output tensor of the op. // @@ -56,6 +54,7 @@ class L1ChainConfig { void complete(const llvm::DenseMap &selectedOpLayout, std::unordered_set &memReconfigEdges); + void complete(); bool isEmpty() { return opL1MemSpecs.empty(); } void addOpL1MemSpec(OpL1MemSpec spec) { @@ -70,6 +69,9 @@ class L1ChainConfig { const std::unordered_set &getMemReconfigEdges() const { return memReconfigEdges; } + + uint64_t size() const { return opL1MemSpecs.size(); } + void merge(L1ChainConfig &other); }; } // namespace mlir::tt::ttnn diff --git a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h index 4f6fcd8f47..33477a798a 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h @@ -17,6 +17,7 @@ class MemoryLayoutAnalysisPolicy { llvm::DenseMap> legalLayouts; llvm::DenseMap> *schedule; unsigned usableL1CacheSize = 0; + DeviceAttr deviceAttr; public: virtual ~MemoryLayoutAnalysisPolicy() {}; diff --git a/include/ttmlir/Dialect/TTNN/Transforms/Passes.td b/include/ttmlir/Dialect/TTNN/Transforms/Passes.td index 99a9bed24f..4597db87e1 100644 --- a/include/ttmlir/Dialect/TTNN/Transforms/Passes.td +++ b/include/ttmlir/Dialect/TTNN/Transforms/Passes.td @@ -86,4 +86,36 @@ def TTNNCreateInputGenerators: Pass<"ttnn-create-input-gens", "::mlir::ModuleOp" }]; } +def TTNNModifySignaturesForDylib: Pass<"ttnn-modify-signatures-for-dylib", "::mlir::ModuleOp"> { + let summary = "Modify signatures of the functions for dylib path."; + let description = [{ + This pass is intended to be used only when the end result is a dylib! + + It modifies signatures of forward functions so that they take a canonical + form. Essentially, input tensors are packed into a tuple and then accessed + in the function body. This allows for easier interfacing with the generated + dylib as the signatures are then uniform across all forward functions. 
+ + Given a forward function like this: + + ```mlir + func.func @add(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + %0 = "ttnn.add"(%arg0, %arg1) : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %0 : tensor<32x32xbf16> + } + ``` + + The pass will modify the signature and prepend unpacking ops like so: + + ```mlir + func.func @add(%arg0: tuple<tensor<32x32xbf16>, tensor<32x32xbf16>>) -> tensor<32x32xbf16> { + %0 = tt.get_tuple_element %arg0[0] : (tuple<tensor<32x32xbf16>, tensor<32x32xbf16>>) -> tensor<32x32xbf16> + %1 = tt.get_tuple_element %arg0[1] : (tuple<tensor<32x32xbf16>, tensor<32x32xbf16>>) -> tensor<32x32xbf16> + %2 = "ttnn.add"(%0, %1) : (tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %2 : tensor<32x32xbf16> + } + ``` + }]; +} + #endif diff --git a/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h b/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h index 5275e2340d..945afda812 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h +++ b/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h @@ -10,7 +10,11 @@ namespace mlir::tt { -enum class MemoryLayoutAnalysisPolicyType { DFSharding, L1Interleaved }; +enum class MemoryLayoutAnalysisPolicyType { + DFSharding, + GreedyL1Interleaved, + BFInterleaved +}; struct MemoryLayoutAnalysisPolicyTypeParser : public llvm::cl::parser<MemoryLayoutAnalysisPolicyType> { @@ -22,8 +26,10 @@ struct MemoryLayoutAnalysisPolicyTypeParser llvm::StringRef arg, MemoryLayoutAnalysisPolicyType &value) { value = llvm::StringSwitch<MemoryLayoutAnalysisPolicyType>(arg) .Case("DFSharding", MemoryLayoutAnalysisPolicyType::DFSharding) - .Case("L1Interleaved", - MemoryLayoutAnalysisPolicyType::L1Interleaved); + .Case("GreedyL1Interleaved", + MemoryLayoutAnalysisPolicyType::GreedyL1Interleaved) + .Case("BFInterleaved", + MemoryLayoutAnalysisPolicyType::BFInterleaved); return false; } @@ -33,8 +39,11 @@ struct MemoryLayoutAnalysisPolicyTypeParser case MemoryLayoutAnalysisPolicyType::DFSharding: res += "DFSharding"; break; - case MemoryLayoutAnalysisPolicyType::L1Interleaved: - res += "L1Interleaved"; + case MemoryLayoutAnalysisPolicyType::GreedyL1Interleaved: + res += "GreedyL1Interleaved"; + break; + case MemoryLayoutAnalysisPolicyType::BFInterleaved: + res += "BFInterleaved"; break; } return res; diff --git a/include/ttmlir/Dialect/TTNN/Utils/Utils.h b/include/ttmlir/Dialect/TTNN/Utils/Utils.h index f214fa793d..d3fb76bda9 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/Utils.h +++ b/include/ttmlir/Dialect/TTNN/Utils/Utils.h @@ -43,6 +43,12 @@ RankedTensorType createRankedTensorTypeWithEncoding(RankedTensorType tensorType, ttnn::TTNNLayoutAttr encoding); +// Return the L1 memory usage of the output tensor of the given op. +// Used within L1 interleaved policies. +// +uint64_t getOpOutputL1Usage(Operation *op, TTNNLayoutAttr opLayout, + DeviceAttr &deviceAttr); + } // namespace mlir::tt::ttnn::utils #endif // TTMLIR_DIALECT_TTNN_UTILS_UTILS_H diff --git a/include/ttmlir/Utils.h b/include/ttmlir/Utils.h index 49dad79e5e..ec7838b2f3 100644 --- a/include/ttmlir/Utils.h +++ b/include/ttmlir/Utils.h @@ -92,8 +92,7 @@ std::string join(const llvm::SmallVector &vec, // This util function can be used as a helper to create an attribute from an // array of attributes for any type defined like for example: // -// `def TT_OperandConstraintArrayAttr // : TypedArrayAttrBase;` +// def TT_IteratorTypeArrayAttr : TypedArrayAttrBase; // // since these don't get any special Cpp class generated for them from // tablegen.
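As an aside on the helper referenced in the comment above, here is a minimal sketch of how such a typed array attribute might be assembled through the C API. The helper's signature is assumed from its call site in `python/TTModule.cpp` (removed later in this diff), and the iterator-type values are placeholders, not values taken from this change:

```cpp
#include "ttmlir-c/TTAttrs.h"
#include "ttmlir/Utils.h"

#include <vector>

// Hypothetical illustration only: build an ArrayAttr of iterator-type
// attributes, since TypedArrayAttrBase definitions get no dedicated C++ class
// from tablegen. The wrapArrayOfMlirAttributesAsAttribute signature is assumed
// from its (removed) use in python/TTModule.cpp; 0/1 are placeholder values.
MlirAttribute buildIteratorTypeArrayAttr(MlirContext ctx) {
  std::vector<MlirAttribute> iteratorTypes;
  iteratorTypes.push_back(ttmlirTTIteratorTypeAttrGet(ctx, /*iteratorType=*/0));
  iteratorTypes.push_back(ttmlirTTIteratorTypeAttrGet(ctx, /*iteratorType=*/1));
  return ::ttmlir::utils::wrapArrayOfMlirAttributesAsAttribute(ctx,
                                                               iteratorTypes);
}
```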
diff --git a/lib/CAPI/TTAttrs.cpp b/lib/CAPI/TTAttrs.cpp index c329f41d56..ae2bb49d55 100644 --- a/lib/CAPI/TTAttrs.cpp +++ b/lib/CAPI/TTAttrs.cpp @@ -167,28 +167,6 @@ MlirAttribute ttmlirTTIteratorTypeArrayAttrGet(MlirContext ctx, return wrap(ArrayAttr::get(unwrap(ctx), iteratorTypesArray)); } -MlirAttribute ttmlirTTOperandConstraintAttrGet(MlirContext ctx, - uint32_t operandConstraint) { - return wrap(OperandConstraintAttr::get( - unwrap(ctx), static_cast(operandConstraint))); -} - -MlirAttribute -ttmlirTTOperandConstraintArrayAttrGet(MlirContext ctx, - uint32_t *operandConstraints, - size_t operandConstraintsSize) { - std::vector operandConstraintsEnumArray( - operandConstraints, operandConstraints + operandConstraintsSize); - std::vector operandConstraintsArray; - - for (auto operandEnum : operandConstraintsEnumArray) { - operandConstraintsArray.push_back(OperandConstraintAttr::get( - unwrap(ctx), static_cast(operandEnum))); - } - - return wrap(ArrayAttr::get(unwrap(ctx), operandConstraintsArray)); -} - MlirAttribute ttmlirTTTileSizeAttrGet(MlirContext ctx, int64_t y, int64_t x) { return wrap(TileSizeAttr::get(unwrap(ctx), y, x)); } diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index b1d438b905..c1a07b5fcd 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -5,6 +5,7 @@ #include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h" #include "ttmlir/Conversion/TTNNToEmitC/Utils.h" +#include "ttmlir/Dialect/TT/IR/TTOps.h" #include "ttmlir/Dialect/TT/IR/TTOpsDialect.h.inc" #include "ttmlir/Dialect/TTNN/IR/TTNN.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" @@ -576,6 +577,42 @@ class ArithConstantOpConversionPattern } }; +class GetTupleElementOpConversionPattern + : public OpConversionPattern { + +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(tt::GetTupleElementOp getTupleElementOp, + tt::GetTupleElementOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // SubscriptOp requires a Value object as index, which is created by + // invoking the emitc::LiteralOp + // + Value indexAsVal = rewriter.create( + getTupleElementOp->getLoc(), rewriter.getIndexType(), + std::to_string(adaptor.getIndex())); + + // SubscriptOp also returns an emitc::LValueType, so we wrap the OpaqueType + // with LValueType + // + emitc::LValueType lvalueReturnType = emitc::LValueType::get( + emitc::OpaqueType::get(rewriter.getContext(), "ttnn::Tensor")); + Value subscript = rewriter.create( + getTupleElementOp->getLoc(), lvalueReturnType, adaptor.getOperand(), + indexAsVal); + + // As SubscriptOp returns an LValueType, we need to convert it to an + // OpaqueType - this is done by invoking the emitc::LoadOp + // + rewriter.replaceOpWithNewOp( + getTupleElementOp, emitc::OpaqueType::get(getContext(), "ttnn::Tensor"), + subscript); + return success(); + } +}; + // Module Op conversion pattern // // This conversion pattern removes attributes from the ModuleOp. 
Previously, @@ -724,10 +761,6 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, patterns.add>(typeConverter, ctx); - // Module op - // - patterns.add(typeConverter, ctx); - // KV Cache ops // patterns.add>(typeConverter, @@ -738,6 +771,14 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // Arith ops // patterns.add(typeConverter, ctx); + + // Module op + // + patterns.add(typeConverter, ctx); + + // Tuple ops + // + patterns.add(typeConverter, ctx); } } // namespace mlir::tt diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp index bd0c9044fc..95e722d846 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitCPass.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" @@ -40,6 +41,9 @@ class TTNNToEmitCTypeConverter : public TypeConverter { addConversion([ctx](mlir::TensorType type) -> emitc::OpaqueType { return emitc::OpaqueType::get(ctx, "ttnn::Tensor"); }); + addConversion([ctx](mlir::TupleType type) -> emitc::OpaqueType { + return emitc::OpaqueType::get(ctx, "std::vector"); + }); } }; diff --git a/lib/Dialect/LLVM/Transforms/EmitHelperFuncs.cpp b/lib/Dialect/LLVM/Transforms/EmitHelperFuncs.cpp index cd54986f65..19cdaf8e98 100644 --- a/lib/Dialect/LLVM/Transforms/EmitHelperFuncs.cpp +++ b/lib/Dialect/LLVM/Transforms/EmitHelperFuncs.cpp @@ -41,7 +41,8 @@ void generateLLVMHelpersForArgRanks(mlir::ModuleOp moduleOp) { builder.setInsertionPointToEnd(moduleOp.getBody()); // Define the helper function name and type - std::string helperName = func.getName().str() + "_helper"; + llvm::SmallString<32> helperName(func.getName()); + helperName.append("_helper"); // Create the helper function auto helperFuncType = LLVM::LLVMFunctionType::get( diff --git a/lib/Dialect/TT/IR/TTDialect.cpp b/lib/Dialect/TT/IR/TTDialect.cpp index 1ac8a22239..764e951432 100644 --- a/lib/Dialect/TT/IR/TTDialect.cpp +++ b/lib/Dialect/TT/IR/TTDialect.cpp @@ -31,21 +31,6 @@ struct TTOpAsmDialectInterface : public OpAsmDialectInterface { os << mlir::cast(attr).getValue(); return AliasResult::OverridableAlias; } - if (llvm::isa(attr)) { - auto value = mlir::cast(attr).getValue(); - if (value == OperandConstraint::Any) { - os << "any"; - } else if (value == OperandConstraint::AnyDevice) { - os << "any_device"; - } else if (value == OperandConstraint::AnyDeviceTile) { - os << "any_device_tile"; - } else if (value == OperandConstraint::L1BlockSharded) { - os << "l1_block_sharded"; - } else { - os << "operand_constraint"; - } - return AliasResult::OverridableAlias; - } if (llvm::isa(attr)) { os << "device"; return AliasResult::OverridableAlias; diff --git a/lib/Dialect/TT/IR/TTOps.cpp b/lib/Dialect/TT/IR/TTOps.cpp index 6f15f813ef..b4f3b951d6 100644 --- a/lib/Dialect/TT/IR/TTOps.cpp +++ b/lib/Dialect/TT/IR/TTOps.cpp @@ -7,3 +7,28 @@ #define GET_OP_CLASSES #include "ttmlir/Dialect/TT/IR/TTOps.cpp.inc" + +namespace mlir::tt { + +LogicalResult GetTupleElementOp::inferReturnTypes( + MLIRContext *, std::optional location, ValueRange operands, + DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + + GetTupleElementOp::Adaptor adaptor(operands, attributes, properties, regions); + + auto 
operandType = dyn_cast(adaptor.getOperand().getType()); + if (!operandType) { + return failure(); + } + if (adaptor.getIndex() >= static_cast(operandType.size())) { + return emitOptionalError(location, "index ", adaptor.getIndex(), + " is out of bounds of operand with size ", + operandType.size()); + } + + inferredReturnTypes.push_back(operandType.getType(adaptor.getIndex())); + return success(); +} + +} // namespace mlir::tt diff --git a/lib/Dialect/TTIR/Transforms/Generic.cpp b/lib/Dialect/TTIR/Transforms/Generic.cpp index 15064ed346..6fc3eb83b3 100644 --- a/lib/Dialect/TTIR/Transforms/Generic.cpp +++ b/lib/Dialect/TTIR/Transforms/Generic.cpp @@ -149,23 +149,6 @@ class TTIRKernelGenericRewriter : public OpRewritePattern { llvm_unreachable("Unsupported kernel kind"); } - static ArrayAttr createOperandConstraints(PatternRewriter &rewriter, - StringRef kind, - mlir::OperandRange operands) { - auto numOperands = operands.size(); - if (kind == "eltwise") { - return rewriter.getArrayAttr(SmallVector( - numOperands, rewriter.getAttr( - OperandConstraint::AnyDevice))); - } - if (kind == "matmul") { - return rewriter.getArrayAttr(SmallVector( - numOperands, rewriter.getAttr( - OperandConstraint::AnyDeviceTile))); - } - llvm_unreachable("Unsupported kernel kind"); - } - LogicalResult matchAndRewrite(KernelOp op, PatternRewriter &rewriter) const final { if (mlir::isa(op.getOperation()->getParentOp())) { @@ -175,12 +158,10 @@ class TTIRKernelGenericRewriter : public OpRewritePattern { // Create a dispatch op auto [indexingMaps, iteratorTypes] = createIndexingMaps(rewriter, op.getKind(), op.getOperands()); - auto constraints = - createOperandConstraints(rewriter, op.getKind(), op.getOperands()); auto dispatch = rewriter.create( op.getLoc(), op.getResults().getTypes(), op.getInputs(), ValueRange() /* cbs */, op.getOutputs(), rewriter.getAttr(), - indexingMaps, iteratorTypes, constraints); + indexingMaps, iteratorTypes); // Create a new basic block for the dispatch op and create block arguments Block *block = rewriter.createBlock(&dispatch.getRegion()); @@ -246,9 +227,6 @@ class TTIRGenericRegionRewriter // Create a generic op. auto [indexingMaps, iteratorTypes] = op.getIndexingMaps(rewriter); - auto constraints = rewriter.getArrayAttr(SmallVector( - op->getNumOperands(), rewriter.getAttr( - OperandConstraint::AnyDeviceTile))); // For testing purposes try getting grid of the resulting tensor and put the // op in the grid. @@ -264,7 +242,7 @@ class TTIRGenericRegionRewriter auto genericOp = rewriter.create( op.getLoc(), op->getResults().getTypes(), dps.getDpsInputs(), ValueRange() /* cbs */, dps.getDpsInits(), gridAttr, indexingMaps, - iteratorTypes, constraints); + iteratorTypes); // Create a new basic block for the generic op and create block arguments. Block *block = rewriter.createBlock(&genericOp.getRegion()); @@ -439,14 +417,8 @@ class TTIRGenericOpCBsRewriter : public OpRewritePattern { SmallVector cbValues; SmallVector operandCBMapping; - SmallVector oldConstraints; - SmallVector cbConstraints; - size_t i = 0; for (auto operand : generic->getOperands()) { - size_t operandIdx = i++; - oldConstraints.push_back(generic.getOperandConstraints()[operandIdx]); - auto ty = mlir::cast(operand.getType()); // Enforcing tiled layout as in kernel we always want to work with tiles. 
@@ -472,42 +444,14 @@ class TTIRGenericOpCBsRewriter : public OpRewritePattern { generic->getLoc(), ty.getShape(), ty.getElementType(), desiredLayout); cbValues.push_back(emptyOp.getResult()); operandCBMapping.push_back(cbValues.size() - 1); - - // Inheriting constraints from the original operand. - // OperandConstraint inherittedConstraint = - // mlir::cast( - // generic.getOperandConstraints()[operandIdx]) - // .getValue(); - // inherittedConstraint = - // bitEnumSet(inherittedConstraint, OperandConstraint::L1); - // inherittedConstraint = - // bitEnumClear(inherittedConstraint, OperandConstraint::DRAM); - // inherittedConstraint = - // bitEnumClear(inherittedConstraint, OperandConstraint::System); - - // Fixing constraint to L1 for the CB operand. - // TODO(rpavlovic) remove or use code above when we decide on the operand - // constraints model. - cbConstraints.push_back( - rewriter.getAttr(OperandConstraint::L1)); } - SmallVector combinedConstraints; - combinedConstraints.append(oldConstraints.begin(), - oldConstraints.begin() + - generic.getInputs().size()); - combinedConstraints.append(cbConstraints.begin(), cbConstraints.end()); - combinedConstraints.append(oldConstraints.begin() + - generic.getInputs().size(), - oldConstraints.end()); - auto newConstraintsArray = rewriter.getArrayAttr(combinedConstraints); - rewriter.setInsertionPointAfter(generic); auto newGenericOp = rewriter.create( generic->getLoc(), generic.getResultTypes(), generic.getInputs(), cbValues, generic.getOutputs(), generic.getGrid(), generic.getIndexingMaps(), generic.getIteratorTypes(), - newConstraintsArray, operandCBMapping); + operandCBMapping); auto &oldRegion = generic.getRegion(); newGenericOp->getRegion(0).takeBody(oldRegion); diff --git a/lib/Dialect/TTIR/Transforms/Layout.cpp b/lib/Dialect/TTIR/Transforms/Layout.cpp index eca9747304..3dc87f03a2 100644 --- a/lib/Dialect/TTIR/Transforms/Layout.cpp +++ b/lib/Dialect/TTIR/Transforms/Layout.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttmlir/Dialect/TT/IR/TT.h" -#include "ttmlir/Dialect/TT/Utils/OperandConstraints.h" +#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" #include "ttmlir/Dialect/TTIR/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" diff --git a/lib/Dialect/TTNN/Analysis/BFInterleavedPolicy.cpp b/lib/Dialect/TTNN/Analysis/BFInterleavedPolicy.cpp new file mode 100644 index 0000000000..4d58d7b5aa --- /dev/null +++ b/lib/Dialect/TTNN/Analysis/BFInterleavedPolicy.cpp @@ -0,0 +1,206 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Dialect/TTNN/Analysis/BFInterleavedPolicy.h" +#include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" +#include "ttmlir/Dialect/TTNN/Utils/Utils.h" +#include "ttmlir/Scheduler/Scheduler.h" + +namespace mlir::tt::ttnn { + +void BFInterleavedPolicy::run() { + for (Operation &funcOp : rootOp->getRegion(0).getOps()) { + func::FuncOp func = dyn_cast(funcOp); + mlir::tt::scheduler::Scheduler scheduler(&func); + deviceAttr = getCurrentScopeDevice(func); + + // Initialize the policy. 
+ // + llvm::DenseMap currentL1UsagePerOp; + uint64_t currentL1Usage = 0; + l1ChainConfigs->push_back(L1ChainConfig()); + + while (scheduler.hasUnscheduledOps()) { + uint64_t minimalChangeInL1Usage; + Operation *nextOpForScheduling; + BufferType nextOpForSchedulingBufferType; + + nextOpForScheduling = nullptr; + minimalChangeInL1Usage = std::numeric_limits::max(); + for (Operation *op : scheduler.getScheduleableOps()) { + uint64_t deallocOfL1Mem, allocOfL1Mem, changeInL1Usage; + BufferType opBufferType; + + // Calculate the L1 memory usage of the op's operands. + // + deallocOfL1Mem = 0; + walkOnAnalyzableOperands(op, [&](Operation *operandOp) { + if (currentL1UsagePerOp.count(operandOp)) { + deallocOfL1Mem += + (currentL1UsagePerOp[operandOp].numOfUnscheduledUsers == 1) * + currentL1UsagePerOp[operandOp].l1MemUsagePerUser; + } + }); + + // Default setup for all DRAM buffer type ops. + // + allocOfL1Mem = 0; + opBufferType = BufferType::DRAM; + + // Analyse the possibility of scheduling the op with L1 memory layout. + // + if (hasL1BufferType(op)) { + TTNNLayoutAttr layout = getL1InterleavedLayout(op); + uint64_t opOutputL1Usage = + utils::getOpOutputL1Usage(op, layout, deviceAttr); + + if (currentL1Usage + opOutputL1Usage <= getAvailableL1CacheSize()) { + allocOfL1Mem = opOutputL1Usage; + opBufferType = BufferType::L1; + } + } + + // Check if the scheduling of the op is consuming the least amount of L1 + // memory among all the scheduleable ops. + // + changeInL1Usage = allocOfL1Mem - deallocOfL1Mem; + if (changeInL1Usage < minimalChangeInL1Usage) { + nextOpForScheduling = op; + nextOpForSchedulingBufferType = opBufferType; + minimalChangeInL1Usage = changeInL1Usage; + } + } + + // In case we picked the L1 layout for the nextOpForScheduling, we need + // to add the OpL1MemSpec to the L1ChainConfig and update the state of L1 + // memory. + // + if (nextOpForSchedulingBufferType == BufferType::L1) { + + // Construct OpL1MemSpec for the nextOpForScheduling. + // + OpL1MemSpec opL1MemSpec; + opL1MemSpec.op = nextOpForScheduling; + opL1MemSpec.layout = getL1InterleavedLayout(nextOpForScheduling); + l1ChainConfigs->back().addOpL1MemSpec(opL1MemSpec); + + // Update the state of L1 memory by allocating the nextOpForScheduling's + // output tensor in L1 memory. + // + uint64_t numOfUsers = std::distance(nextOpForScheduling->user_begin(), + nextOpForScheduling->user_end()); + currentL1UsagePerOp[nextOpForScheduling].l1MemUsagePerUser = + utils::getOpOutputL1Usage(nextOpForScheduling, opL1MemSpec.layout, + deviceAttr); + currentL1UsagePerOp[nextOpForScheduling].numOfUnscheduledUsers = + numOfUsers; + currentL1Usage += + currentL1UsagePerOp[nextOpForScheduling].l1MemUsagePerUser; + } + + // Update the state of L1 memory. + // + walkOnAnalyzableOperands(nextOpForScheduling, [&](Operation *operandOp) { + if (currentL1UsagePerOp.count(operandOp)) { + currentL1UsagePerOp[operandOp].numOfUnscheduledUsers -= 1; + if (currentL1UsagePerOp[operandOp].numOfUnscheduledUsers == 0) { + // Only once we scheduled all the users of the operandOp, we can + // free its output tensor from L1 memory. + // + currentL1Usage -= currentL1UsagePerOp[operandOp].l1MemUsagePerUser; + currentL1UsagePerOp.erase(operandOp); + } + } + }); + + // Schedule the nextOpForScheduling and update currentL1Usage. 
+ // + scheduler.scheduleOp(nextOpForScheduling); + } + + assert(currentL1Usage == 0); + assert(currentL1UsagePerOp.size() == 0); + + (*schedule)[func] = scheduler.getSchedule(); + + // Build, Resolve and Complete all L1ChainConfigs. + // + for (L1ChainConfig &l1ChainConfig : *l1ChainConfigs) { + l1ChainConfig.build(); + l1ChainConfig.resolve(); + l1ChainConfig.complete(); + } + } +} + +bool BFInterleavedPolicy::isAnalyzable(Operation *op) { + // Skip operations that are not analyzed by the LegalGridAnalysis. + // + if (legalLayouts.count(op) > 0) { + // Skip operations that are filterd out by the MemoryLayoutAnalysis. + // + return legalLayouts[op].size() > 0; + } + return false; +} + +void BFInterleavedPolicy::walkOnAnalyzableOperands( + Operation *op, function_ref callback) { + for (auto operand : op->getOperands()) { + // Skip block arguments (%arg0, %arg1, ...) + // + if (::llvm::isa(operand)) { + continue; + } + + Operation *operandOp = operand.getDefiningOp(); + + // Skip non-analyzable operands. + // + if (isAnalyzable(operandOp)) { + callback(operandOp); + } + } +} + +bool BFInterleavedPolicy::hasDRAMBufferType(Operation *op) { + if (legalLayouts.count(op)) { + return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasDRAMBufferType(); + }) != legalLayouts[op].end(); + } + return false; +} + +TTNNLayoutAttr BFInterleavedPolicy::getDRAMLayout(Operation *op) { + assert(hasDRAMBufferType(op)); + auto dramLayoutIter = std::find_if( + legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { return layout.hasDRAMBufferType(); }); + return *dramLayoutIter; +} + +bool BFInterleavedPolicy::hasL1BufferType(Operation *op) { + if (legalLayouts.count(op)) { + return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasInterleavedL1TensorMemoryLayout(); + }) != legalLayouts[op].end(); + } + return false; +} + +TTNNLayoutAttr BFInterleavedPolicy::getL1InterleavedLayout(Operation *op) { + assert(hasL1BufferType(op)); + auto l1InterleaveLayoutIter = + std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasInterleavedL1TensorMemoryLayout(); + }); + return *l1InterleaveLayoutIter; +} + +} // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Analysis/CMakeLists.txt b/lib/Dialect/TTNN/Analysis/CMakeLists.txt index 4db2d78b9c..04cf9c0d22 100644 --- a/lib/Dialect/TTNN/Analysis/CMakeLists.txt +++ b/lib/Dialect/TTNN/Analysis/CMakeLists.txt @@ -4,7 +4,8 @@ add_mlir_dialect_library(MLIRTTNNAnalysis MemoryLayoutAnalysis.cpp L1ChainConfig.cpp DFShardingPolicy.cpp - L1InterleavedPolicy.cpp + GreedyL1InterleavedPolicy.cpp + BFInterleavedPolicy.cpp ShardSolver.cpp ADDITIONAL_HEADER_DIRS diff --git a/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp b/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp index 8d5f22bfc4..bd74c5a00f 100644 --- a/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp +++ b/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp @@ -10,7 +10,7 @@ namespace mlir::tt::ttnn { void DFShardingPolicy::run() { rootOp->walk([&](func::FuncOp func) { - DeviceAttr deviceAttr = getCurrentScopeDevice(func); + deviceAttr = getCurrentScopeDevice(func); mlir::tt::scheduler::Scheduler scheduler(&func); l1ChainConfigs->push_back(L1ChainConfig()); llvm::SmallVector scheduleableOps; diff --git a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp b/lib/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.cpp similarity index 85% 
rename from lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp rename to lib/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.cpp index 69a07af168..5606132906 100644 --- a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp +++ b/lib/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.cpp @@ -2,33 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" +#include "ttmlir/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.h" #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" +#include "ttmlir/Dialect/TTNN/Utils/Utils.h" #include "ttmlir/Scheduler/Scheduler.h" namespace mlir::tt::ttnn { -uint64_t getOpOutputL1Usage(Operation *op, TTNNLayoutAttr opLayout, - DeviceAttr &deviceAttr) { - // In case the opLayout is not in L1 memory space, L1 memory usage is 0. - // - if (opLayout.hasDRAMBufferType()) { - return 0; - } - - // L1 memory usage of the ops without output tensors cannot be calculated. - // So far, this is only false for ttnn.get_device op. - // - assert(mlir::isa(op->getResult(0).getType())); - llvm::ArrayRef opOutputTensorShape = - mlir::cast(op->getResult(0).getType()).getShape(); - - uint64_t opL1OutputUsage = - opLayout.getTensorSizeInBytes(opOutputTensorShape, deviceAttr); - return opL1OutputUsage; -} - -L1InterleavedPolicy::OpConfig L1InterleavedPolicy::getGreedyConfig( +GreedyL1InterleavedPolicy::OpConfig GreedyL1InterleavedPolicy::getGreedyConfig( Operation *baseOp, llvm::DenseMap &opsL1Usage) { uint64_t numOfOps, bitIndex, currentMask; uint64_t currentL1Usage, optimalL1Usage; @@ -146,10 +127,10 @@ L1InterleavedPolicy::OpConfig L1InterleavedPolicy::getGreedyConfig( return optimalConfig; } -void L1InterleavedPolicy::run() { +void GreedyL1InterleavedPolicy::run() { for (Operation &funcOp : rootOp->getRegion(0).getOps()) { func::FuncOp func = dyn_cast(funcOp); - DeviceAttr deviceAttr = getCurrentScopeDevice(func); + deviceAttr = getCurrentScopeDevice(func); // Start the policy. 
// @@ -185,8 +166,8 @@ void L1InterleavedPolicy::run() { if (op->hasOneUse() && hasL1BufferType(op)) { L1Usage l1Usage; - l1Usage.outputL1Usage = - getOpOutputL1Usage(op, getL1InterleavedLayout(op), deviceAttr); + l1Usage.outputL1Usage = utils::getOpOutputL1Usage( + op, getL1InterleavedLayout(op), deviceAttr); l1Usage.requiredL1Usage = 0; opsL1Usage[op] = l1Usage; } @@ -211,8 +192,8 @@ void L1InterleavedPolicy::run() { // if (operandOpLayout.hasInterleavedL1TensorMemoryLayout()) { L1Usage l1Usage; - l1Usage.outputL1Usage = - getOpOutputL1Usage(operandOp, operandOpLayout, deviceAttr); + l1Usage.outputL1Usage = utils::getOpOutputL1Usage( + operandOp, operandOpLayout, deviceAttr); l1Usage.requiredL1Usage = OpMemSpecMap[operandOp].requiredL1Usage; opsL1Usage[operandOp] = l1Usage; } @@ -271,14 +252,15 @@ void L1InterleavedPolicy::run() { std::max(intermediateRequiredL1Usage, intermediateL1Usage + OpMemSpecMap[operandOp].requiredL1Usage); - intermediateL1Usage += getOpOutputL1Usage( + intermediateL1Usage += utils::getOpOutputL1Usage( operandOp, OpMemSpecMap[operandOp].layout, deviceAttr); } } - OpMemSpecMap[op].requiredL1Usage = std::max( - intermediateRequiredL1Usage, - intermediateL1Usage + - getOpOutputL1Usage(op, OpMemSpecMap[op].layout, deviceAttr)); + OpMemSpecMap[op].requiredL1Usage = + std::max(intermediateRequiredL1Usage, + intermediateL1Usage + + utils::getOpOutputL1Usage( + op, OpMemSpecMap[op].layout, deviceAttr)); } } } @@ -308,8 +290,8 @@ void L1InterleavedPolicy::run() { } } -bool L1InterleavedPolicy::isAnalyzable(Operation *op) { - // Skip operations that are not analyzed by the LegalLayoutAnalysis. +bool GreedyL1InterleavedPolicy::isAnalyzable(Operation *op) { + // Skip operations that are not analyzed by the LegalGridAnalysis. // if (legalLayouts.count(op) > 0) { // Skip operations that are filterd out by the MemoryLayoutAnalysis. 
@@ -319,14 +301,14 @@ bool L1InterleavedPolicy::isAnalyzable(Operation *op) { return false; } -bool L1InterleavedPolicy::hasDRAMBufferType(Operation *op) { +bool GreedyL1InterleavedPolicy::hasDRAMBufferType(Operation *op) { return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), [](TTNNLayoutAttr layout) { return layout.hasDRAMBufferType(); }) != legalLayouts[op].end(); } -TTNNLayoutAttr L1InterleavedPolicy::getDRAMLayout(Operation *op) { +TTNNLayoutAttr GreedyL1InterleavedPolicy::getDRAMLayout(Operation *op) { assert(hasDRAMBufferType(op)); auto dramLayoutIter = std::find_if( legalLayouts[op].begin(), legalLayouts[op].end(), @@ -334,14 +316,15 @@ TTNNLayoutAttr L1InterleavedPolicy::getDRAMLayout(Operation *op) { return *dramLayoutIter; } -bool L1InterleavedPolicy::hasL1BufferType(Operation *op) { +bool GreedyL1InterleavedPolicy::hasL1BufferType(Operation *op) { return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), [](TTNNLayoutAttr layout) { return layout.hasInterleavedL1TensorMemoryLayout(); }) != legalLayouts[op].end(); } -TTNNLayoutAttr L1InterleavedPolicy::getL1InterleavedLayout(Operation *op) { +TTNNLayoutAttr +GreedyL1InterleavedPolicy::getL1InterleavedLayout(Operation *op) { assert(hasL1BufferType(op)); auto l1InterleaveLayoutIter = std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), diff --git a/lib/Dialect/TTNN/Analysis/L1ChainConfig.cpp b/lib/Dialect/TTNN/Analysis/L1ChainConfig.cpp index 534f67a5fe..bf2eb9790a 100644 --- a/lib/Dialect/TTNN/Analysis/L1ChainConfig.cpp +++ b/lib/Dialect/TTNN/Analysis/L1ChainConfig.cpp @@ -50,4 +50,18 @@ void L1ChainConfig::complete( state = L1ChainState::Completed; } +void L1ChainConfig::complete() { + assert(state == L1ChainState::Resolved); + state = L1ChainState::Completed; +} + +void L1ChainConfig::merge(L1ChainConfig &other) { + assert(getState() == other.getState()); + opL1MemSpecs.insert(opL1MemSpecs.end(), other.opL1MemSpecs.begin(), + other.opL1MemSpecs.end()); + l1ChainedOps.insert(other.l1ChainedOps.begin(), other.l1ChainedOps.end()); + memReconfigEdges.insert(other.memReconfigEdges.begin(), + other.memReconfigEdges.end()); +} + } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp index f3db4ed7bf..ef7e3d1c11 100644 --- a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp +++ b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h" +#include "ttmlir/Dialect/TTNN/Analysis/BFInterleavedPolicy.h" #include "ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h" -#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" +#include "ttmlir/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" namespace mlir::tt::ttnn { @@ -68,14 +69,22 @@ void MemoryLayoutAnalysis::analysisImplementation() { dfShardingPolicy.run(); break; } - case MemoryLayoutAnalysisPolicyType::L1Interleaved: { - L1InterleavedPolicy l1InterleavedPolicy( + case MemoryLayoutAnalysisPolicyType::GreedyL1Interleaved: { + GreedyL1InterleavedPolicy l1InterleavedPolicy( op, l1ChainConfigs, filterDRAMAndL1Interleaved(analysisInput.legalLayouts), analysisResult.schedule, analysisInput.usableL1CacheSize); l1InterleavedPolicy.run(); break; } + case MemoryLayoutAnalysisPolicyType::BFInterleaved: { + BFInterleavedPolicy bfInterleavedPolicy( + op, l1ChainConfigs, + 
filterDRAMAndL1Interleaved(analysisInput.legalLayouts), + analysisResult.schedule, analysisInput.usableL1CacheSize); + bfInterleavedPolicy.run(); + break; + } } // Copy over default legal layouts. diff --git a/lib/Dialect/TTNN/Transforms/Passes.cpp b/lib/Dialect/TTNN/Transforms/Passes.cpp index 20172f4fd1..f35768d63d 100644 --- a/lib/Dialect/TTNN/Transforms/Passes.cpp +++ b/lib/Dialect/TTNN/Transforms/Passes.cpp @@ -4,6 +4,8 @@ #include "ttmlir/Dialect/TTNN/Transforms/Passes.h" +#include "ttmlir/Dialect/TT/IR/TTOps.h" +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/IR/TTNNOpsTypes.h" @@ -12,6 +14,7 @@ #include "mlir/Analysis/Liveness.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeRange.h" @@ -24,6 +27,7 @@ namespace mlir::tt::ttnn { #define GEN_PASS_DEF_TTNNDEALLOCATE #define GEN_PASS_DEF_TTNNDECOMPOSELAYOUTS #define GEN_PASS_DEF_TTNNCREATEINPUTGENERATORS +#define GEN_PASS_DEF_TTNNMODIFYSIGNATURESFORDYLIB #include "ttmlir/Dialect/TTNN/Transforms/Passes.h.inc" class TTNNDeallocate : public impl::TTNNDeallocateBase { @@ -906,7 +910,7 @@ class TTNNCreateInputGenerators // Block *firstBlock = module.getBody(0); - // Find all the func.func ops in the module + // Find all the func.func ops in the module that are "forward" functions // SmallVector forwardFuncOps; for (mlir::Operation &op : firstBlock->getOperations()) { @@ -1065,4 +1069,96 @@ class TTNNCreateInputGenerators } }; +class TTNNModifySignaturesForDylib + : public impl::TTNNModifySignaturesForDylibBase< + TTNNModifySignaturesForDylib> { + +public: + using impl::TTNNModifySignaturesForDylibBase< + TTNNModifySignaturesForDylib>::TTNNModifySignaturesForDylibBase; + + void runOnOperation() final { + ModuleOp module = getOperation(); + IRRewriter rewriter(&getContext()); + + // Ensure that the module has a single region and a single block within that + // region + assert(module->getRegions().size() == 1); + assert(module->getRegion(0).getBlocks().size() == 1); + + // Get the first block of the region at index 0 + // + Block *firstBlock = module.getBody(0); + + // Find all the func.func ops in the module that are "forward" functions + // + SmallVector forwardFuncOps; + for (mlir::Operation &op : firstBlock->getOperations()) { + if (mlir::func::FuncOp funcOp = dyn_cast(op)) { + + // Skip functions that are called elsewhere in the IR + // + // This will skip utility functions that are used by other functions, + // only top-level "forward" functions should be considered + // + if (!funcOp->getUses().empty()) { + continue; + } + + forwardFuncOps.push_back(funcOp); + } + } + + // Iterate over all the func ops and modify the signatures + // + for (mlir::func::FuncOp forwardFuncOp : forwardFuncOps) { + // Replace the signature of the forward function so that all the tensor + // arguments are packed into a single tuple + // + mlir::FunctionType originalFuncType = forwardFuncOp.getFunctionType(); + assert( + std::all_of(originalFuncType.getInputs().begin(), + originalFuncType.getInputs().end(), + [](Type t) { return mlir::isa(t); }) && + "Expected all inputs must be of type RankedTensorType"); + mlir::TupleType inputTupleType = + mlir::TupleType::get(&getContext(), originalFuncType.getInputs()); + FunctionType tuplifiedFuncType = + 
originalFuncType.clone(inputTupleType, originalFuncType.getResults()); + rewriter.modifyOpInPlace(forwardFuncOp, + [&forwardFuncOp, &tuplifiedFuncType]() { + forwardFuncOp.setType(tuplifiedFuncType); + }); + + // First block of the function (often referred to as "entry block") needs + // its arguments updated as well - the args need to match the containing + // func's arguments; this is implemented here by first inserting the tuple + // as the first argument of the block, inserting GetTupleElementOp ops to + // start of the block in order to unpack tuple elements, and then + // replacing all uses of the original block arguments with the + // GetTupleElementOp results - after this it's finally safe to remove + // original block arguments as they have no live uses anymore + // + Block &entryBlock = forwardFuncOp.getBlocks().front(); + entryBlock.insertArgument(/*index=*/0u, + tuplifiedFuncType.getInputs().front(), + forwardFuncOp.getLoc()); + + rewriter.setInsertionPointToStart(&entryBlock); + for (size_t idx = 0; idx < originalFuncType.getInputs().size(); idx++) { + ::mlir::tt::GetTupleElementOp getTupleElementOp = + rewriter.create( + forwardFuncOp.getLoc(), forwardFuncOp.getArgument(0), idx); + + rewriter.replaceAllUsesWith(entryBlock.getArgument(1 + idx), + getTupleElementOp); + } + + // Erase original arguments + // + entryBlock.eraseArguments(1, originalFuncType.getInputs().size()); + } + } +}; + } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp index 80b76d6d45..e148b575fb 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNLayout.cpp @@ -2,7 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttmlir/Dialect/TT/Utils/OperandConstraints.h" +#include "ttmlir/Dialect/TT/IR/TT.h" +#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" #include "ttmlir/Dialect/TTNN/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" diff --git a/lib/Dialect/TTNN/Utils/Utils.cpp b/lib/Dialect/TTNN/Utils/Utils.cpp index 751c714ede..6976dd35f4 100644 --- a/lib/Dialect/TTNN/Utils/Utils.cpp +++ b/lib/Dialect/TTNN/Utils/Utils.cpp @@ -117,4 +117,24 @@ createRankedTensorTypeWithEncoding(RankedTensorType tensorType, tensorType.getElementType(), encoding); } +uint64_t getOpOutputL1Usage(Operation *op, TTNNLayoutAttr opLayout, + DeviceAttr &deviceAttr) { + assert(mlir::isa(op->getResult(0).getType()) && + "L1 memory usage of the ops without output tensors cannot be " + "calculated."); + + // In case the opLayout is not in L1 memory space, L1 memory usage is 0. 
+ // + if (opLayout.hasDRAMBufferType()) { + return 0; + } + + llvm::ArrayRef opOutputTensorShape = + mlir::cast(op->getResult(0).getType()).getShape(); + + uint64_t opL1OutputUsage = + opLayout.getTensorSizeInBytes(opOutputTensorShape, deviceAttr); + return opL1OutputUsage; +} + } // namespace mlir::tt::ttnn::utils diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index cbfc3bf95f..6f3c99345e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -75,6 +75,12 @@ declare_mlir_python_sources(TTMLIRPythonSources.Passes SOURCES passes.py ) +declare_mlir_python_sources(TTMLIRPythonSources.Util + ROOT_DIR "${TTMLIR_PYTHON_ROOT_DIR}" + ADD_TO_PARENT TTMLIRPythonSources + SOURCES util.py +) + declare_mlir_python_sources(TTMLIRPythonTestInfra.TestInfra ROOT_DIR "${TTMLIR_PYTHON_TEST_INFRA_ROOT_DIR}" ADD_TO_PARENT TTMLIRPythonTestInfra @@ -95,6 +101,7 @@ declare_mlir_python_extension(TTMLIRPythonExtensions.Main Overrides.cpp OptimizerOverrides.cpp Passes.cpp + Util.cpp EMBED_CAPI_LINK_LIBS MLIRCAPITransforms TTMLIRCAPI diff --git a/python/OptimizerOverrides.cpp b/python/OptimizerOverrides.cpp index bd5ce94f43..18806654c7 100644 --- a/python/OptimizerOverrides.cpp +++ b/python/OptimizerOverrides.cpp @@ -71,8 +71,10 @@ void populateOptimizerOverridesModule(py::module &m) { py::enum_( m, "MemoryLayoutAnalysisPolicyType") .value("DFSharding", mlir::tt::MemoryLayoutAnalysisPolicyType::DFSharding) - .value("L1Interleaved", - mlir::tt::MemoryLayoutAnalysisPolicyType::L1Interleaved); + .value("GreedyL1Interleaved", + mlir::tt::MemoryLayoutAnalysisPolicyType::GreedyL1Interleaved) + .value("BFInterleaved", + mlir::tt::MemoryLayoutAnalysisPolicyType::BFInterleaved); py::enum_(m, "BufferType") .value("DRAM", mlir::tt::ttnn::BufferType::DRAM) diff --git a/python/TTMLIRModule.cpp b/python/TTMLIRModule.cpp index 0347da75b5..2805399205 100644 --- a/python/TTMLIRModule.cpp +++ b/python/TTMLIRModule.cpp @@ -43,4 +43,6 @@ PYBIND11_MODULE(_ttmlir, m) { auto optimizer_overrides = m.def_submodule( "optimizer_overrides", "Python-Bound Optimizer Overrides"); mlir::ttmlir::python::populateOptimizerOverridesModule(optimizer_overrides); + auto util = m.def_submodule("util", "Python-Bound Utilities & Helpers"); + mlir::ttmlir::python::populateUtilModule(util); } diff --git a/python/TTModule.cpp b/python/TTModule.cpp index 9b4bad5873..f64630bf19 100644 --- a/python/TTModule.cpp +++ b/python/TTModule.cpp @@ -397,24 +397,6 @@ void populateTTModule(py::module &m) { return static_cast(self.getValue()); }); - tt_attribute_class(m, "OperandConstraintAttr") - .def_static("get", - [](MlirContext ctx, uint32_t operandConstraint) { - return wrap(tt::OperandConstraintAttr::get( - unwrap(ctx), - static_cast(operandConstraint))); - }) - .def_static( - "get", - [](MlirContext ctx, std::vector attributesArray) { - return ::ttmlir::utils::wrapArrayOfMlirAttributesAsAttribute( - ctx, attributesArray); - }) - .def_property_readonly("operand_constraint_as_int", - [](tt::OperandConstraintAttr self) { - return static_cast(self.getValue()); - }); - tt_type_class(m, "DeviceType") .def_static( "get", diff --git a/python/Util.cpp b/python/Util.cpp new file mode 100644 index 0000000000..c562306bc3 --- /dev/null +++ b/python/Util.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttmlir/Bindings/Python/TTMLIRModule.h" + +namespace mlir::ttmlir::python { + +void populateUtilModule(py::module &m) { + m.def("debug_print_module", [](MlirModule module) { + std::string source; + llvm::raw_string_ostream os(source); + mlir::OpPrintingFlags flags; + flags.enableDebugInfo(); // Enable the debug print + auto *op = unwrap(mlirModuleGetOperation(module)); + op->print(os, flags); + return source; + }); + + m.def("get_loc_name", [](MlirLocation _loc) -> std::string { + mlir::Location loc = unwrap(_loc); + if (mlir::isa(loc)) { + mlir::NameLoc nameLoc = mlir::cast(loc); + return nameLoc.getName().str(); + } + return "-"; + }); + + m.def("get_loc_full", [](MlirLocation _loc) -> std::string { + mlir::Location loc = unwrap(_loc); + if (mlir::isa(loc)) { + mlir::FileLineColLoc fileLoc = mlir::cast(loc); + return fileLoc.getFilename().str() + ":" + + std::to_string(fileLoc.getLine()) + ":" + + std::to_string(fileLoc.getColumn()); + } + return "-"; + }); +} + +} // namespace mlir::ttmlir::python diff --git a/python/test_infra/test_utils.py b/python/test_infra/test_utils.py index 09e86db975..da1957b7f6 100644 --- a/python/test_infra/test_utils.py +++ b/python/test_infra/test_utils.py @@ -4,9 +4,8 @@ import os import inspect -from typing import Callable, Dict, List, Optional +from typing import Callable, List, Optional -import torch from ttmlir.dialects import func from ttmlir.ir import * from ttmlir.passes import ( diff --git a/python/test_infra/ttir_builder.py b/python/test_infra/ttir_builder.py index 9c832d014d..471c07ca7d 100644 --- a/python/test_infra/ttir_builder.py +++ b/python/test_infra/ttir_builder.py @@ -3,12 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import inspect +import inspect from dataclasses import dataclass from typing import List, Optional, Union, Tuple, Callable, Dict from ttmlir.ir import * -from ttmlir.dialects import ttir, tt, func, tensor +from ttmlir.dialects import ttir, tt, tensor from ttmlir.passes import create_golden_tensor, DataType import torch @@ -17,7 +17,50 @@ Operand = Union[Value, OpView, Operation] # Convenience alias for shape -Shape = Union[List[int], Tuple[int]] +Shape = Union[List[int], Tuple[int, ...]] + + +def get_loc_of_extra_file_callee(id: int = 0) -> Location: + """When called, this function returns a `Location` referring to first + callee outside the file of the caller of this function. E.G., if a function + in `foo.py` called a function in `bar.py` that then called this function, + the location would be pointing to the call in `foo.py`. + + NOTE: this location is _NOT_ in the form of + {filename}:{line_number}:{col_number}, but instead in the form: + {filename}:{line_number}:id({id}), where id is supplied to this function as + a disambiguator for calls that happen on the same line + + Arguments + --------- + + id : int + An optional variable that defaults to 0 to be appended to the location, + disambiguating calls on the same line. 
+ + Returns + ------- + + A `Location` referring to the first extra file callee of the caller of this function + + """ + + stack = inspect.stack() + + # find the innermost frame outside of this file + caller_filename = stack[1].filename + + while len(stack) > 0 and stack[0].filename == caller_filename: + stack = stack[1:] + + assert ( + len(stack) > 0 + ), "Top of callstack to builder funcs must be outside the caller's file" + + # FIXME: this should be a `Location.file`, but for some reason it causes + # strange decomposition inheritance behaviour that breaks using this as + # a key into the golden map + return Location.name(f"{stack[0].filename}:{str(stack[0].lineno)}:id({str(id)})") @dataclass(frozen=True) @@ -251,40 +294,20 @@ def eltwise_proxy( inputs: List[Operand], ) -> OpView: - # Snoop the location of the first caller outside of this file to - # annotate the MLIR with. NOTE that this location is _NOT_ row:col, but - # instead row:id, where id is a unique id given to all calls to builder - # funcs. See `get_next_global_id` for more details - stack = inspect.stack() - - # find the innermost frame outside of this file - cur_filename = stack[0].filename - - while len(stack) > 0 and stack[0].filename == cur_filename: - stack = stack[1:] - - assert ( - len(stack) > 0 - ), "Top of callstack to builder funcs must be outside this file" + id = self.get_next_global_id() + loc = get_loc_of_extra_file_callee(id=id) with self._ctx, self._loc: output = self.empty(self.get_shape(inputs[0])) - id = self.get_next_global_id() - - op = op_ttir_function( - [self._get_type(output)], - inputs, - [output], - loc=Location.name(str(id)), - ) + op = op_ttir_function([self._get_type(output)], inputs, [output], loc=loc) goldens = [] for input in inputs: goldens.append(self._get_golden_tensor(input)) golden = Golden(op_golden_function(*goldens)) - self.id_golden_map[str(id)] = golden + self.id_golden_map[str(loc)] = golden self._store_golden(op, golden) self._override_golden(output, golden) diff --git a/python/ttmlir/util.py b/python/ttmlir/util.py new file mode 100644 index 0000000000..9bcbfd0856 --- /dev/null +++ b/python/ttmlir/util.py @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +from ._mlir_libs._ttmlir.util import * diff --git a/runtime/tools/python/ttrt/common/golden.py b/runtime/tools/python/ttrt/common/golden.py index 055d4c8249..847942615d 100644 --- a/runtime/tools/python/ttrt/common/golden.py +++ b/runtime/tools/python/ttrt/common/golden.py @@ -117,17 +117,7 @@ def golden_partial_function( print("-----------executing golden comparision-----------") try: - op_debug_str = ttrt.runtime.get_op_debug_str(op_context) - - # find matching golden tensor based on loc in op debug string - match = re.search(r"loc\(([^)]+)\)", op_debug_str) - - if not match: - print(f"debug_str={op_debug_str}") - print("No location found in debug string - skipping golden comparison") - return - - loc = match.group(1).replace('"', "") + loc = ttrt.runtime.get_op_loc_info(op_context) print(f"found location={loc}") op_golden_tensor = binary.get_debug_info_golden(loc) diff --git a/test/python/golden/test_ttir_ops.py b/test/python/golden/test_ttir_ops.py index aa18e10369..e693196f53 100644 --- a/test/python/golden/test_ttir_ops.py +++ b/test/python/golden/test_ttir_ops.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -# RUN: %python %s +# RUN: SYSTEM_DESC_PATH=%system_desc_path% %python %s import inspect diff --git a/test/python/smoketest.py 
b/test/python/smoketest.py index dfc324e381..88d871101e 100644 --- a/test/python/smoketest.py +++ b/test/python/smoketest.py @@ -12,7 +12,6 @@ module = Module.parse( """ - #any_device = #tt.operand_constraint %0 = tensor.empty() : tensor<64x128xf32> %1 = tensor.empty() : tensor<64x128xf32> %2 = tensor.empty() : tensor<64x128xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/binary/divide_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/binary/divide_op.mlir index 9505428888..bd8a698c7f 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/binary/divide_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/binary/divide_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_divice attributes {} { func.func public @test_divide(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.divide %arg0, %arg1 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/binary/multiply_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/binary/multiply_op.mlir index 390fc156b4..63d8773bd7 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/binary/multiply_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/binary/multiply_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_multiply attributes {} { func.func public @test_multiply(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.multiply %arg0, %arg1 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/binary/remainder_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/binary/remainder_op.mlir index bbca3a3f99..1d4a6a20c8 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/binary/remainder_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/binary/remainder_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_remainder attributes {} { func.func public @test_remainder(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>) -> tensor<32x32xf32> { %0 = stablehlo.remainder %arg0, %arg1 : tensor<32x32xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/binary/subtract_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/binary/subtract_op.mlir index 1be903064c..4003dbf73f 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/binary/subtract_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/binary/subtract_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_subtract attributes {} { func.func public @test_subtract(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.subtract %arg0, %arg1 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/convert_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/convert_op.mlir index 467325566a..ee2862cadd 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/convert_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/convert_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_convert attributes {} { func.func public @test_convert(%arg0: tensor<2x4xf32>) -> 
tensor<2x4xbf16> { %0 = stablehlo.convert %arg0 : (tensor<2x4xf32>) -> tensor<2x4xbf16> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir index 43241ac6f0..f92517600a 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/dynamic_iota_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_dnamic_iota attributes {} { func.func public @test_dynamic_iota() -> tensor<1x32x128x128xf32> { // CHECK: %[[C:.*]] = "ttir.arange"[[C:.*]] diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/floor_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/floor_op.mlir index 2df91d6c5e..90e99aca6a 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/floor_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/floor_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_floor attributes {} { func.func public @test_floor(%arg0: tensor<32x32x3xf32>) -> tensor<32x32x3xf32> { %0 = stablehlo.floor %arg0 : tensor<32x32x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir index e80bb75886..a9b4071edf 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/gather_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_gather attributes {} { func.func public @test_gather_0(%operand: tensor<32000x1024xf32>, %start_indices: tensor<1x32xi32>) -> tensor<1x32x1024xf32> { %0 = "stablehlo.gather"(%operand, %start_indices) <{dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array}> : (tensor<32000x1024xf32>, tensor<1x32xi32>) -> tensor<1x32x1024xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/get_dimension_size_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/get_dimension_size_op.mlir index 5e2a655388..f5df438612 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/get_dimension_size_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/get_dimension_size_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_get_dimension_size attributes {} { func.func public @test_get_dimension_size(%arg0: tensor<13x21x3xf32>) -> tensor { %0 = stablehlo.get_dimension_size %arg0, dim = 1 : (tensor<13x21x3xf32>) -> tensor diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir index 857a621bb0..d7620cd4a6 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/iota_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_iota attributes {} { func.func public @test_iota() -> tensor<1x32x128x128xf32> { // CHECK: %[[C:.*]] = "ttir.arange"[[C:.*]] diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/isfinite_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/isfinite_op.mlir index bdcef7a959..5f7c3d8359 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/isfinite_op.mlir +++ 
b/test/ttmlir/Conversion/StableHLOToTTIR/isfinite_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_isfinite attributes {} { func.func public @test_isfinite(%arg0: tensor<32x32x3xf32>) -> tensor<32x32x3xi1> { // CHECK: %[[E:.*]] = tensor.empty() : tensor<32x32x3xbf16> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/rsqrt_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/rsqrt_op.mlir index 13ceed52e9..f52bb72bf0 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/rsqrt_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/rsqrt_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_rsqrt attributes {} { func.func public @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.rsqrt %arg0 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/slice_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/slice_op.mlir index 5839f38652..c94acfb886 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/slice_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/slice_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_subtract attributes {} { func.func @slice_op(%arg0: tensor<32x64xf32>) -> tensor<8x8xf32> { // CHECK: %[[C:.*]] = tensor.empty[[C:.*]] diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/absolute_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/absolute_op.mlir index dbed57cd3b..1d43609f8f 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/unary/absolute_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/absolute_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_abs attributes {} { func.func public @test_abs(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.abs %arg0 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/cbrt_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/cbrt_op.mlir index 49eef5df7d..e754202468 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/unary/cbrt_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/cbrt_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_rsqrt attributes {} { func.func public @test_cbrt(%arg0: tensor<4xf64>) -> tensor<4xf64> { %0 = stablehlo.cbrt %arg0 : tensor<4xf64> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/exponential_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/exponential_op.mlir index 6b93c664c7..975652f49b 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/unary/exponential_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/exponential_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_exp attributes {} { func.func public @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.exponential %arg0 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/negate_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/negate_op.mlir index 
65f491c038..68cfed1e9e 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/unary/negate_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/negate_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_neg attributes {} { func.func public @test_neg(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.negate %arg0 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Conversion/StableHLOToTTIR/unary/sqrt_op.mlir b/test/ttmlir/Conversion/StableHLOToTTIR/unary/sqrt_op.mlir index 72d2449b05..99023e472b 100644 --- a/test/ttmlir/Conversion/StableHLOToTTIR/unary/sqrt_op.mlir +++ b/test/ttmlir/Conversion/StableHLOToTTIR/unary/sqrt_op.mlir @@ -1,6 +1,5 @@ // REQUIRES: stablehlo // RUN: ttmlir-opt --stablehlo-to-ttir-pipeline %s | FileCheck %s -#any_device = #tt.operand_constraint module @jit_eltwise_sqrt attributes {} { func.func public @test_sqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = stablehlo.sqrt %arg0 : tensor<13x21x3xf32> diff --git a/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir b/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir index 6f72e56f17..6e5ef160c3 100644 --- a/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir +++ b/test/ttmlir/Dialect/TTIR/Decomposition/arange_decomposition.mlir @@ -1,5 +1,4 @@ // RUN: ttmlir-opt --ttir-to-ttir-decomposition %s | FileCheck %s -#any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<1x32x128x128xf32>) -> tensor<1x32x128x128xf32> { // CHECK: %[[C:.*]] = "ttir.arange"[[C:.*]] diff --git a/test/ttmlir/Dialect/TTIR/convolution/convolution_tests_negative.mlir b/test/ttmlir/Dialect/TTIR/convolution/convolution_tests_negative.mlir index 278bb9f211..6f960f0dda 100644 --- a/test/ttmlir/Dialect/TTIR/convolution/convolution_tests_negative.mlir +++ b/test/ttmlir/Dialect/TTIR/convolution/convolution_tests_negative.mlir @@ -28,7 +28,6 @@ module @jit_convolution_bad_spatial_dimensions { } // ----- -#any_device_tile = #tt.operand_constraint module @jit_convolution_bad_stride_dimensions { func.func public @test_illegal_convolution(%arg0: tensor<1x3x100x100xbf16>, %arg1: tensor<7x3x3x3xbf16>) -> tensor<1x7x100x100xbf16> { %0 = tensor.empty() : tensor<1x7x100x100xbf16> @@ -58,7 +57,6 @@ module @jit_convolution_bad_stride_dimensions { } // ----- -#any_device_tile = #tt.operand_constraint module @jit_convolution_bad_input_tensor { func.func public @test_illegal_convolution(%arg0: tensor<1x3x100x100x100xbf16>, %arg1: tensor<7x3x3x3xbf16>) -> tensor<1x7x100x100xbf16> { %0 = tensor.empty() : tensor<1x7x100x100xbf16> @@ -88,7 +86,6 @@ module @jit_convolution_bad_input_tensor { } // ----- -#any_device_tile = #tt.operand_constraint module @jit_convolution_bad_weight_tensor { func.func public @test_illegal_convolution(%arg0: tensor<1x3x100x100xbf16>, %arg1: tensor<20x7x3x3x3xbf16>) -> tensor<1x7x100x100xbf16> { %0 = tensor.empty() : tensor<1x7x100x100xbf16> @@ -118,7 +115,6 @@ module @jit_convolution_bad_weight_tensor { } // ----- -#any_device_tile = #tt.operand_constraint module @jit_convolution_bad_bias_tensor { func.func public @test_illegal_convolution(%arg0: tensor<1x3x100x100xbf16>, %arg1: tensor<7x3x3x3xbf16>, %arg2: tensor<1x1x7xbf16>) -> tensor<1x7x100x100xbf16> { %0 = tensor.empty() : tensor<1x7x100x100xbf16> diff --git a/test/ttmlir/Dialect/TTIR/index/index_tests_negative.mlir 
b/test/ttmlir/Dialect/TTIR/index/index_tests_negative.mlir index 9f5d8b04ae..6b1d5faee3 100644 --- a/test/ttmlir/Dialect/TTIR/index/index_tests_negative.mlir +++ b/test/ttmlir/Dialect/TTIR/index/index_tests_negative.mlir @@ -13,7 +13,6 @@ module attributes {} { // Verify that the parsing fails if the dim is not in the rank range of the input tensor // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_begins(%arg0: tensor<3x128x64xbf16>) -> tensor<3x128x64xbf16> { %0 = tensor.empty() : tensor<3x128x64xbf16> @@ -25,7 +24,6 @@ module attributes {} { // Verify that the parsing fails if the output type is not equal to the input tensor type // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_output_datatype(%arg0: tensor<3x128x64xbf16>) -> tensor<3x128x32xf32> { %0 = tensor.empty() : tensor<3x128x32xf32> @@ -37,7 +35,6 @@ module attributes {} { // Verify that the parsing fails if the output rank is not equal to the input tensor rank // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_input_output_rank_missmatch(%arg0: tensor<3x128x64xbf16>) -> tensor<3x64x64x1xbf16> { %0 = tensor.empty() : tensor<3x64x64x1xbf16> @@ -49,7 +46,6 @@ module attributes {} { // Verify that the parsing fails if the begin value exceeds positive limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_begin_positive(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x1x128x64xbf16> { %0 = tensor.empty() : tensor<10x1x128x64xbf16> @@ -61,7 +57,6 @@ module attributes {} { // Verify that the parsing fails if the begin value exceeds negative limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_begin_negative(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x64x64xbf16> { %0 = tensor.empty() : tensor<10x3x64x64xbf16> @@ -73,7 +68,6 @@ module attributes {} { // Verify that the parsing fails if the end value exceeds positive limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_end_positive(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x128x64xbf16> { %0 = tensor.empty() : tensor<10x3x128x64xbf16> @@ -85,7 +79,6 @@ module attributes {} { // Verify that the parsing fails if the end value exceeds positive limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_end_negative(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x128x64xbf16> { %0 = tensor.empty() : tensor<10x3x128x64xbf16> @@ -97,7 +90,6 @@ module attributes {} { // Verify that the parsing fails if the step value is equal to zero // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_step_is_zero(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x128x64xbf16> { %0 = tensor.empty() : tensor<10x3x128x64xbf16> @@ -109,7 +101,6 @@ module attributes {} { // Verify that the parsing fails if the begin index is greater than end and step is positive // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_begin_greater_than_end_positive_step(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x128x64xbf16> { %0 = tensor.empty() : tensor<10x3x128x64xbf16> @@ -121,7 +112,6 @@ module attributes {} { // Verify that the parsing fails if the end index is greater than begin and step is negative // ----- 
-#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_begin_less_than_end_negative_step(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x128x64xbf16> { %0 = tensor.empty() : tensor<10x3x128x64xbf16> @@ -133,7 +123,6 @@ module attributes {} { // Verify that the parsing fails if there is missmatch in output dimension // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @index_negative_invalid_output_shape(%arg0: tensor<10x3x128x64xbf16>) -> tensor<10x3x128x32xbf16> { %0 = tensor.empty() : tensor<10x3x128x32xbf16> diff --git a/test/ttmlir/Dialect/TTIR/matmul/matmul_tests_negative.mlir b/test/ttmlir/Dialect/TTIR/matmul/matmul_tests_negative.mlir index f15379c8f7..d6c20b0ac5 100644 --- a/test/ttmlir/Dialect/TTIR/matmul/matmul_tests_negative.mlir +++ b/test/ttmlir/Dialect/TTIR/matmul/matmul_tests_negative.mlir @@ -12,7 +12,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_1d_1d_inner_dimension_missmatch(%arg0: tensor<128xbf16>, %arg1: tensor) -> tensor<1xbf16> { // CHECK: error: 'ttir.matmul' op Input B must be at least a 1D tensor @@ -24,7 +23,6 @@ module attributes {} { // Verify that the parsing fails if the output is a scalar // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_1d_1d_inner_dimension_missmatch(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>) -> tensor { // CHECK: error: 'ttir.matmul' op Scalar output is not supported, output must be at least a 1D tensor @@ -35,7 +33,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_1d_1d_inner_dimension_missmatch(%arg0: tensor<128xbf16>, %arg1: tensor<128xbf16>) -> tensor<2xbf16> { // CHECK: error: 'ttir.matmul' op Scalar output must be a 1D tensor of size 1 @@ -47,7 +44,6 @@ module attributes {} { // Inner dimension mismatch tests // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_1d_1d_inner_dimension_missmatch(%arg0: tensor<128xbf16>, %arg1: tensor<64xbf16>) -> tensor<1xbf16> { // CHECK: error: 'ttir.matmul' op Input A[-1](128) and B[-2](64) must have matching inner dimensions @@ -58,7 +54,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_1d_2d_inner_dimension_missmatch(%arg0: tensor<64xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64xbf16> { // CHECK: error: 'ttir.matmul' op Input A[-1](64) and B[-2](128) must have matching inner dimensions @@ -69,7 +64,6 @@ func.func @matmul_negative_1d_2d_inner_dimension_missmatch(%arg0: tensor<64xbf16 } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_2d_1d_inner_dimension_missmatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<64xbf16>) -> tensor<64xbf16> { // CHECK: error: 'ttir.matmul' op Input A[-1](128) and B[-2](64) must have matching inner dimensions @@ -80,7 +74,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_2d_2d_inner_dimension_missmatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<64x128xbf16>) -> tensor<64x64xbf16> { // CHECK: error: 'ttir.matmul' op Input A[-1](128) and B[-2](64) must have matching inner dimensions @@ -91,7 +84,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func 
@matmul_negative_nd_nd_inner_dimension_missmatch(%arg0: tensor<7x64x128xbf16>, %arg1: tensor<1x64x128xbf16>) -> tensor<7x64x64xbf16> { // CHECK: error: 'ttir.matmul' op Input A[-1](128) and B[-2](64) must have matching inner dimensions @@ -103,7 +95,6 @@ module attributes {} { // Batch dimension mismatch tests // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_nd_nd_same_rank_batch_broadcast_incompatible_1(%arg0: tensor<7x64x128xbf16>, %arg1: tensor<2x128x64xbf16>) -> tensor<7x64x64xbf16> { // CHECK: error: 'ttir.matmul' op Batch dimensions of input A(7) and B(2) are not broadcast compatible @@ -114,7 +105,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_nd_nd_same_rank_batch_broadcast_incompatible_2(%arg0: tensor<2x7x64x128xbf16>, %arg1: tensor<7x1x128x64xbf16>) -> tensor<7x7x64x64xbf16> { // CHECK: error: 'ttir.matmul' op Batch dimensions of input A(2,7) and B(7,1) are not broadcast compatible @@ -125,7 +115,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_nd_nd_different_rank_batch_broadcast_incompatible(%arg0: tensor<12x2x7x64x128xbf16>, %arg1: tensor<7x1x128x64xbf16>) -> tensor<12x7x7x64x64xbf16> { // CHECK: error: 'ttir.matmul' op Batch dimensions of input A(12,2,7) and B(7,1) are not broadcast compatible @@ -137,7 +126,6 @@ module attributes {} { // Output shape mismatch tests // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_2d_2d_inner_dimension_missmatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64xbf16> { // CHECK: error: 'ttir.matmul' op Output shape rank(1) must match the expected output shape rank(2) @@ -148,7 +136,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @matmul_negative_2d_2d_inner_dimension_missmatch(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x64xbf16>) -> tensor<64x128xbf16> { // CHECK: error: 'ttir.matmul' op Output shape dimension[1](128) doesn't match the expected output shape dimension[1](64) diff --git a/test/ttmlir/Dialect/TTIR/slice/slice_tests_negative.mlir b/test/ttmlir/Dialect/TTIR/slice/slice_tests_negative.mlir index db444258ec..5e7eaf43c6 100644 --- a/test/ttmlir/Dialect/TTIR/slice/slice_tests_negative.mlir +++ b/test/ttmlir/Dialect/TTIR/slice/slice_tests_negative.mlir @@ -13,7 +13,6 @@ module attributes {} { // Verify that the parsing fails if the begins size is not equal to the input tensor rank // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_begins(%arg0: tensor<3x128x64xbf16>) -> tensor<1x64x64xbf16> { %0 = tensor.empty() : tensor<1x64x64xbf16> @@ -25,7 +24,6 @@ module attributes {} { // Verify that the parsing fails if the ends size is not equal to the input tensor rank // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_ends(%arg0: tensor<3x128x64xbf16>) -> tensor<1x64x64xbf16> { %0 = tensor.empty() : tensor<1x64x64xbf16> @@ -37,7 +35,6 @@ module attributes {} { // Verify that the parsing fails if the step size is not equal to the input tensor rank // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_step(%arg0: tensor<3x128x64xbf16>) -> tensor<1x64x64xbf16> { %0 = tensor.empty() : tensor<1x64x64xbf16> @@ -49,7 +46,6 @@ 
module attributes {} { // Verify that the parsing fails if the output type is not equal to the input tensor type // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_output_datatype(%arg0: tensor<3x128x64xbf16>) -> tensor<1x64x64xf32> { %0 = tensor.empty() : tensor<1x64x64xf32> @@ -61,7 +57,6 @@ module attributes {} { // Verify that the parsing fails if the output rank is not equal to the input tensor rank // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_input_output_rank_missmatch(%arg0: tensor<3x128x64xbf16>) -> tensor<1x1x64x64xbf16> { %0 = tensor.empty() : tensor<1x1x64x64xbf16> @@ -73,7 +68,6 @@ module attributes {} { // Verify that the parsing fails if the begin value exceeds positive limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_begin_positive(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -85,7 +79,6 @@ module attributes {} { // Verify that the parsing fails if the begin value exceeds negative limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_begin_negative(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -97,7 +90,6 @@ module attributes {} { // Verify that the parsing fails if the end value exceeds positive limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_end_positive(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -109,7 +101,6 @@ module attributes {} { // Verify that the parsing fails if the end value exceeds negative limit // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_end_negative(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -121,7 +112,6 @@ module attributes {} { // Verify that the parsing fails if the step value is equal to zero // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_step_is_zero(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -133,7 +123,6 @@ module attributes {} { // Verify that the parsing fails if the begin index is greater than end and step is positive // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_begin_greater_than_end_positive_step(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -144,7 +133,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_begin_greater_than_end_positive_step(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x8x8xbf16> { %0 = tensor.empty() : tensor<4x1x8x8xbf16> @@ -156,7 +144,6 @@ module attributes {} { // Verify that the parsing fails if the end index is greater than begin and step is negative // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_begin_less_than_end_negative_step(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x8xbf16> { %0 = tensor.empty() : tensor<4x1x16x8xbf16> @@ -167,7 +154,6 @@ module attributes {} { } // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { 
func.func @slice_negative_begin_less_than_end_negative_step(%arg0: tensor<10x3x128x64xbf16>) -> tensor<5x1x16x8xbf16> { %0 = tensor.empty() : tensor<5x1x16x8xbf16> @@ -179,7 +165,6 @@ module attributes {} { // Verify that the parsing fails if there is missmatch in output dimension // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @slice_negative_invalid_output_shape(%arg0: tensor<10x3x128x64xbf16>) -> tensor<4x1x16x16xbf16> { %0 = tensor.empty() : tensor<4x1x16x16xbf16> diff --git a/test/ttmlir/Dialect/TTNN/Transforms/ttnn_modify_signatures_for_dylib_0.mlir b/test/ttmlir/Dialect/TTNN/Transforms/ttnn_modify_signatures_for_dylib_0.mlir new file mode 100644 index 0000000000..f7cab85905 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/Transforms/ttnn_modify_signatures_for_dylib_0.mlir @@ -0,0 +1,12 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" --ttnn-modify-signatures-for-dylib %s | FileCheck %s + +module attributes {} { + // CHECK: func.func @add(%arg0: tuple<[[TENSOR_A:.*>]], [[TENSOR_B:.*>]]>) -> tensor<32x32xbf16, #ttnn_layout> { + func.func @add(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + // CHECK-NEXT: %0 = tt.get_tuple_element %arg0[0] : (tuple<[[TENSOR_A]], [[TENSOR_B]]>) -> [[TENSOR_A]] + // CHECK-NEXT: %1 = tt.get_tuple_element %arg0[1] : (tuple<[[TENSOR_A]], [[TENSOR_B]]>) -> [[TENSOR_B]] + %0 = tensor.empty() : tensor<32x32xbf16> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %1 : tensor<32x32xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/ccl/all_reduce.mlir b/test/ttmlir/Dialect/TTNN/ccl/all_reduce.mlir index 1eaf04df7f..5fa77776be 100644 --- a/test/ttmlir/Dialect/TTNN/ccl/all_reduce.mlir +++ b/test/ttmlir/Dialect/TTNN/ccl/all_reduce.mlir @@ -17,7 +17,6 @@ module attributes {} { // ----- // Verify lowering of ttir all_reduce to ttnn ops -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @all_reduce(%arg0: tensor<1x1x4096x16384xf32>) -> tensor<1x1x4096x16384xf32> { %0 = tensor.empty() : tensor<1x1x4096x16384xf32> diff --git a/test/ttmlir/Dialect/TTNN/clamp/clamp_tests_negative.mlir b/test/ttmlir/Dialect/TTNN/clamp/clamp_tests_negative.mlir index f97dde2ba1..67eb3d2116 100644 --- a/test/ttmlir/Dialect/TTNN/clamp/clamp_tests_negative.mlir +++ b/test/ttmlir/Dialect/TTNN/clamp/clamp_tests_negative.mlir @@ -2,7 +2,6 @@ // Negative tests for matmul operation // Verify that the parsing fails if input and output shapes do not match. -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @clamp(%arg0: tensor<64x64xbf16>) -> tensor<64x128xbf16> { // CHECK: error: 'ttnn.clamp' op input and output must have same shape. @@ -14,7 +13,6 @@ module attributes {} { // Verify that parsing fails in case of more than one input. // ----- -#any_device_tile = #tt.operand_constraint module attributes {} { func.func @clamp2(%arg0: tensor<64x128xbf16>, %arg1: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK: error: 'ttnn.clamp' op expects one tensor as input. 
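Stepping back to the new TTNNModifySignaturesForDylib pass and its test above (ttnn_modify_signatures_for_dylib_0.mlir): the rewrite is easiest to picture in ordinary code. The sketch below is a conceptual Python analogy only, with hypothetical names and no call into the actual pass; it shows how an n-argument forward function becomes a single-tuple function whose entry block unpacks the elements, which is what the tt.get_tuple_element ops do in the MLIR.

# Conceptual analogy of the signature rewrite (hypothetical names, not the real API).

# Before: every tensor is a separate entry-block argument.
def forward_before(a, b):
    return a + b

# After: the pass packs all tensor arguments into one tuple argument and
# prepends one "get_tuple_element" per original argument; every old use is
# redirected to the unpacked value, then the old arguments are erased.
def forward_after(inputs: tuple):
    a = inputs[0]   # tt.get_tuple_element %arg0[0]
    b = inputs[1]   # tt.get_tuple_element %arg0[1]
    return a + b

Note that only functions with no uses elsewhere in the module are rewritten, mirroring the funcOp->getUses().empty() check: utility functions called from other functions keep their original signatures.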
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_dram_buffer_type.mlir b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_dram_buffer_type.mlir new file mode 100644 index 0000000000..bdce3fd0f9 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_dram_buffer_type.mlir @@ -0,0 +1,13 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=BFInterleaved" %s | FileCheck %s +// XFAIL: * +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<6144x6144xbf16>, %arg1: tensor<6144x6144xbf16>, %arg2: tensor<6144x6144xbf16>) -> tensor<6144x6144xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + %0 = tensor.empty() : tensor<6144x6144xbf16> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<6144x6144xbf16>, tensor<6144x6144xbf16>, tensor<6144x6144xbf16>) -> tensor<6144x6144xbf16> + %2 = tensor.empty() : tensor<6144x6144xbf16> + %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<6144x6144xbf16>, tensor<6144x6144xbf16>, tensor<6144x6144xbf16>) -> tensor<6144x6144xbf16> + return %3 : tensor<6144x6144xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_dram_operands_l1_op.mlir b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_dram_operands_l1_op.mlir new file mode 100644 index 0000000000..ec809a60a7 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_dram_operands_l1_op.mlir @@ -0,0 +1,19 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=BFInterleaved" %s | FileCheck %s +module attributes {} { + func.func @forward(%arg0: tensor<5120x8192xbf16>, %arg1: tensor<8192x5120xbf16>) -> tensor<5120x5120xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<32x20x!tt.tile<32x32, bf16>, #dram>, > + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<20x32x!tt.tile<32x32, bf16>, #dram>, > + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<20x20x!tt.tile<32x32, bf16>, #l1_>, > + %0 = tensor.empty() : tensor<5120x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<5120x8192xbf16, #[[LAYOUT_6]]> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array}> : (tensor<5120x8192xbf16>, tensor<5120x8192xbf16>) -> tensor<5120x8192xbf16> + %2 = tensor.empty() : tensor<8192x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<8192x5120xbf16, #[[LAYOUT_5]]> + %3 = "ttir.relu"(%arg1, %2) <{operandSegmentSizes = array}> : (tensor<8192x5120xbf16>, tensor<8192x5120xbf16>) -> tensor<8192x5120xbf16> + %4 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_7]]> + %5 = "ttir.matmul"(%1, %3, %4) : (tensor<5120x8192xbf16>, tensor<8192x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + return %5 : tensor<5120x5120xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_l1_operands_dram_op.mlir 
b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_l1_operands_dram_op.mlir new file mode 100644 index 0000000000..0460f6ac47 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/all_l1_operands_dram_op.mlir @@ -0,0 +1,19 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=BFInterleaved" %s | FileCheck %s +module attributes {} { + func.func @forward(%arg0: tensor<6144x1024xbf16>, %arg1: tensor<1024x6144xbf16>) -> tensor<6144x6144xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<24x4x!tt.tile<32x32, bf16>, #l1_>, > + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<4x24x!tt.tile<32x32, bf16>, #l1_>, > + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<24x24x!tt.tile<32x32, bf16>, #dram>, > + %0 = tensor.empty() : tensor<6144x1024xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<6144x1024xbf16, #[[LAYOUT_5]]> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array}> : (tensor<6144x1024xbf16>, tensor<6144x1024xbf16>) -> tensor<6144x1024xbf16> + %2 = tensor.empty() : tensor<1024x6144xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<1024x6144xbf16, #[[LAYOUT_6]]> + %3 = "ttir.relu"(%arg1, %2) <{operandSegmentSizes = array}> : (tensor<1024x6144xbf16>, tensor<1024x6144xbf16>) -> tensor<1024x6144xbf16> + %4 = tensor.empty() : tensor<6144x6144xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<6144x6144xbf16, #[[LAYOUT_7]]> + %5 = "ttir.matmul"(%1, %3, %4) : (tensor<6144x1024xbf16>, tensor<1024x6144xbf16>, tensor<6144x6144xbf16>) -> tensor<6144x6144xbf16> + return %5 : tensor<6144x6144xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/fork_join_01.mlir b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/fork_join_01.mlir new file mode 100644 index 0000000000..5446082c75 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/fork_join_01.mlir @@ -0,0 +1,35 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=BFInterleaved" %s | FileCheck %s +// +// A +// | +// B +// / \ +// C D +// \ / +// E +// | +// F +// +// There is enough L1 memory to schedule this fork-join but only if we allocate +// the output tensor of the op B once. 
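The comment above makes a quantitative claim about L1 capacity; the short Python sketch below redoes the arithmetic. The 8x8 grid, 2-byte bf16 elements, and the roughly 1.5 MB usable-L1-per-core figure are illustrative assumptions (the real budget comes from the system descriptor and usableL1CacheSize), but they show why the schedule fits only when B's output is allocated once.

# Rough per-core L1 arithmetic for the fork-join above (illustrative assumptions:
# 2-byte bf16 elements, tensors interleaved across an 8x8 grid, ~1.5 MB usable L1
# per core; the real figures come from the system descriptor).
CORES = 8 * 8
USABLE_L1_PER_CORE = 1.5 * 1024 * 1024  # bytes, assumed budget

def per_core_bytes(shape, elem_bytes=2, cores=CORES):
    """Bytes one core holds for a tensor interleaved across all cores."""
    n = 1
    for d in shape:
        n *= d
    return n * elem_bytes / cores

b_out = per_core_bytes((4096, 5120))  # relu B output   -> 640 KiB per core
c_out = per_core_bytes((4096, 1024))  # matmul C output -> 128 KiB per core
d_out = per_core_bytes((4096, 1024))  # matmul D output -> 128 KiB per core

# B feeds both matmuls, so its output stays live while C and D execute.
# Counted once, the peak is ~0.9 MB per core and fits the assumed budget;
# counted once per consumer it reaches ~1.5 MB per core and no longer does.
assert b_out + c_out + d_out < USABLE_L1_PER_CORE
assert 2 * b_out + c_out + d_out >= USABLE_L1_PER_CORE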
+// +module attributes {} { + func.func @forward(%arg0: tensor<4096x5120xbf16>, %arg1: tensor<5120x1024xbf16>, %arg2: tensor<5120x1024xbf16>) -> tensor<4096x1024xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<16x20x!tt.tile<32x32, bf16>, #l1_>, > + // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<16x4x!tt.tile<32x32, bf16>, #l1_>, > + %0 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_5]]> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + %2 = tensor.empty() : tensor<4096x1024xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x1024xbf16, #[[LAYOUT_6]]> + %3 = "ttir.matmul"(%1, %arg1, %2) : (tensor<4096x5120xbf16>, tensor<5120x1024xbf16>, tensor<4096x1024xbf16>) -> tensor<4096x1024xbf16> + %4 = tensor.empty() : tensor<4096x1024xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x1024xbf16, #[[LAYOUT_6]]> + %5 = "ttir.matmul"(%1, %arg2, %4) : (tensor<4096x5120xbf16>, tensor<5120x1024xbf16>, tensor<4096x1024xbf16>) -> tensor<4096x1024xbf16> + %6 = tensor.empty() : tensor<4096x1024xbf16> + // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x1024xbf16, #[[LAYOUT_6]]> + %7 = "ttir.add"(%3, %5, %6) <{operandSegmentSizes = array}> : (tensor<4096x1024xbf16>, tensor<4096x1024xbf16>, tensor<4096x1024xbf16>) -> tensor<4096x1024xbf16> + return %7 : tensor<4096x1024xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/fork_join_02.mlir b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/fork_join_02.mlir new file mode 100644 index 0000000000..ee44b78c21 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/bf_interleaved_policy/fork_join_02.mlir @@ -0,0 +1,42 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=BFInterleaved" %s | FileCheck %s +// +// A +// | +// B +// / \ +// C D +// | | +// E | +// \ / +// F +// | +// G +// +// There is not enough L1 memory to schedule this fork-join even if we allocate +// the output tensor of the op B once because the output tensor of the op C is +// too large to fit in L1 on its own.
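For this second fork-join the blocker is the 4096x9216 intermediate on the C branch; reusing per_core_bytes from the sketch above (same illustrative assumptions):

c_branch = per_core_bytes((4096, 9216))  # 1,179,648 bytes, i.e. 1,152 KiB per core

Adding B's 640 KiB puts the pair well past the assumed per-core budget, which is why the CHECK lines below expect that tensor's layout (a memref of 16x36 tiles) in #dram while the smaller tensors keep L1 layouts.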
+// +module attributes {} { + func.func @forward(%arg0: tensor<4096x5120xbf16>, %arg1: tensor<5120x9216xbf16>, %arg2: tensor<9216x1024xbf16>, %arg3: tensor<5120x1024xbf16>) -> tensor<4096x1024xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK: #[[LAYOUT_9:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<16x20x!tt.tile<32x32, bf16>, #l1_>, > + // CHECK: #[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<16x36x!tt.tile<32x32, bf16>, #dram>, > + // CHECK: #[[LAYOUT_11:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8, (d0, d1) -> (0, d0, d1)>, memref<16x4x!tt.tile<32x32, bf16>, #l1_>, > + %0 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_9]]> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + %2 = tensor.empty() : tensor<4096x9216xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x9216xbf16, #[[LAYOUT_10]]> + %3 = "ttir.matmul"(%1, %arg1, %2) : (tensor<4096x5120xbf16>, tensor<5120x9216xbf16>, tensor<4096x9216xbf16>) -> tensor<4096x9216xbf16> + %4 = tensor.empty() : tensor<4096x1024xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x1024xbf16, #[[LAYOUT_11]]> + %5 = "ttir.matmul"(%3, %arg2, %4) : (tensor<4096x9216xbf16>, tensor<9216x1024xbf16>, tensor<4096x1024xbf16>) -> tensor<4096x1024xbf16> + %6 = tensor.empty() : tensor<4096x1024xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x1024xbf16, #[[LAYOUT_11]]> + %7 = "ttir.matmul"(%1, %arg3, %6) : (tensor<4096x5120xbf16>, tensor<5120x1024xbf16>, tensor<4096x1024xbf16>) -> tensor<4096x1024xbf16> + %8 = tensor.empty() : tensor<4096x1024xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x1024xbf16, #[[LAYOUT_11]]> + %9 = "ttir.add"(%5, %7, %8) <{operandSegmentSizes = array}> : (tensor<4096x1024xbf16>, tensor<4096x1024xbf16>, tensor<4096x1024xbf16>) -> tensor<4096x1024xbf16> + return %9 : tensor<4096x1024xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/all_l1_interleaved_policy.mlir similarity index 97% rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/all_l1_interleaved_policy.mlir index a895ca25ef..eaa16fb268 100644 --- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/all_l1_interleaved_policy.mlir @@ -1,4 +1,4 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<64x96xbf16>, %arg3: tensor<96x32xbf16>, %arg4: tensor<64x32xbf16>) -> tensor<64x32xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/fork_join.mlir 
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/fork_join.mlir
index d9336db8a8..657da93390 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/fork_join.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A
 // |
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/mnist_l1_interleaved.mlir
similarity index 98%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/mnist_l1_interleaved.mlir
index 3d437a74fa..d1a32308ff 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/mnist_l1_interleaved.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 #loc = loc("MNISTLinear":4294967295:0)
 module @"tt-forge-graph" attributes {} {
   func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> {
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir
similarity index 96%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir
index ecd90f1abb..436518ce0f 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir
index 056ded8d35..8f018f9515 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir
index caaf3254d8..0791c46295 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir
index 63cd3bcaa2..049f8f0b45 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir
index 9f12e8b6f6..0a63866a63 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir
index c594ca4182..c75c2f39c7 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir
similarity index 97%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir
index eb2a51b174..635540ea61 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir
similarity index 96%
rename from test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir
rename to test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir
index 883842694f..1d591ef2b2 100644
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir
+++ b/test/ttmlir/Dialect/TTNN/optimizer/greedy_l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s | FileCheck %s
 //
 // A B
 // \ /
diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir
deleted file mode 100644
index 7b8aa07593..0000000000
--- a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s
-// UNSUPPORTED: true
-module attributes {} {
-  func.func @forward(%arg0: tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> {
-    %0 = tensor.empty() : tensor<5120x5120xbf16>
-    %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16>
-    return %1 : tensor<5120x5120xbf16>
-  }
-}
diff --git a/test/ttmlir/Silicon/TTNN/ones.mlir b/test/ttmlir/Silicon/TTNN/ones.mlir
index 249530700b..660de36ae1 100644
--- a/test/ttmlir/Silicon/TTNN/ones.mlir
+++ b/test/ttmlir/Silicon/TTNN/ones.mlir
@@ -2,7 +2,6 @@
 // RUN: FileCheck %s --input-file=%t.mlir
 // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
-#any_device = #tt.operand_constraint
 module {
   func.func @ones_2d() -> tensor<32x128xbf16> {
     // CHECK: {{.*}} = "ttnn.ones"() {{.*}}
diff --git a/test/ttmlir/Silicon/TTNN/optimizer/simple_fork_join.mlir b/test/ttmlir/Silicon/TTNN/optimizer/simple_fork_join.mlir
index e323e10249..b6b3c4d686 100644
--- a/test/ttmlir/Silicon/TTNN/optimizer/simple_fork_join.mlir
+++ b/test/ttmlir/Silicon/TTNN/optimizer/simple_fork_join.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=GreedyL1Interleaved" %s > %t.mlir
 // RUN: FileCheck %s --input-file=%t.mlir
 // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
 // UNSUPPORTED: true
diff --git a/test/unittests/Optimizer/CMakeLists.txt b/test/unittests/Optimizer/CMakeLists.txt
index b05c8ae294..cf11c479fd 100644
--- a/test/unittests/Optimizer/CMakeLists.txt
+++ b/test/unittests/Optimizer/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_mlir_unittest(OptimizerTests
   TestShardSolver.cpp
   TestOptimizerOverrides.cpp
-  TestL1InterleavedPolicy.cpp
+  TestGreedyL1InterleavedPolicy.cpp
 )
 
 target_link_libraries(OptimizerTests
diff --git a/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp b/test/unittests/Optimizer/TestGreedyL1InterleavedPolicy.cpp
similarity index 93%
rename from test/unittests/Optimizer/TestL1InterleavedPolicy.cpp
rename to test/unittests/Optimizer/TestGreedyL1InterleavedPolicy.cpp
index b09b65245d..3bc0c54410 100644
--- a/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp
+++ b/test/unittests/Optimizer/TestGreedyL1InterleavedPolicy.cpp
@@ -16,14 +16,14 @@
 #include "ttmlir/Dialect/TTNN/IR/TTNN.h"
 #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h"
-#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h"
+#include "ttmlir/Dialect/TTNN/Analysis/GreedyL1InterleavedPolicy.h"
 
 using namespace mlir::tt::ttnn;
 
 constexpr int TensorDimX = 128;
 constexpr int TensorDimY = 128;
 
-class L1InterleavedPolicyBase : public ::testing::Test {
+class GreedyL1InterleavedPolicyBase : public ::testing::Test {
 public:
   mlir::MLIRContext context;
   mlir::OwningOpRef module;
@@ -31,9 +31,9 @@ class L1InterleavedPolicyBase : public ::testing::Test {
   mlir::func::FuncOp func;
   mlir::tt::DeviceAttr deviceAttr;
 
-  using OpMemSpec = L1InterleavedPolicy::OpMemSpec;
-  using OpConfig = L1InterleavedPolicy::OpConfig;
-  using L1Usage = L1InterleavedPolicy::L1Usage;
+  using OpMemSpec = GreedyL1InterleavedPolicy::OpMemSpec;
+  using OpConfig = GreedyL1InterleavedPolicy::OpConfig;
+  using L1Usage = GreedyL1InterleavedPolicy::L1Usage;
 
   void SetUp() override {
     context.loadDialect();
@@ -121,7 +121,7 @@ class L1InterleavedPolicyBase : public ::testing::Test {
   void TearDown() override {}
 };
 
-TEST_F(L1InterleavedPolicyBase, VerifyGreedyPolicy) {
+TEST_F(GreedyL1InterleavedPolicyBase, VerifyGreedyPolicy) {
   std::vector l1ChainConfigs;
   llvm::DenseMap> legalLayouts;
   llvm::DenseMap>
@@ -174,8 +174,8 @@ TEST_F(L1InterleavedPolicyBase, VerifyGreedyPolicy) {
                            legalLayouts, opsL1Usage);
 
   // Run greedy config picker policy
-  L1InterleavedPolicy l1InterleavedPolicy(nullptr, l1ChainConfigs, legalLayouts,
-                                          schedule, usableL1CacheSize);
+  GreedyL1InterleavedPolicy l1InterleavedPolicy(
+      nullptr, l1ChainConfigs, legalLayouts, schedule, usableL1CacheSize);
   OpConfig greedyConfig = l1InterleavedPolicy.getGreedyConfig(opD, opsL1Usage);
 
   // Sanity checks
diff --git a/test/unittests/Optimizer/TestOptimizerOverrides.cpp b/test/unittests/Optimizer/TestOptimizerOverrides.cpp
index 31118262f5..fee0796505 100644
--- a/test/unittests/Optimizer/TestOptimizerOverrides.cpp
+++ b/test/unittests/Optimizer/TestOptimizerOverrides.cpp
@@ -356,9 +356,9 @@ TEST_F(TestOptimizerOverrideHandler, TestSetMemoryLayoutAnalysisPolicy) {
             mlir::tt::MemoryLayoutAnalysisPolicyType::DFSharding);
 
   optimizerOverridesHandler.setMemoryLayoutAnalysisPolicy(
-      mlir::tt::MemoryLayoutAnalysisPolicyType::L1Interleaved);
+      mlir::tt::MemoryLayoutAnalysisPolicyType::GreedyL1Interleaved);
   ASSERT_EQ(optimizerOverridesHandler.getMemoryLayoutAnalysisPolicy(),
-            mlir::tt::MemoryLayoutAnalysisPolicyType::L1Interleaved);
+            mlir::tt::MemoryLayoutAnalysisPolicyType::GreedyL1Interleaved);
 }
 
 // Test the setInputLayoutOverrides method
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 54a1563c4f..e49dc0d8c9 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -1,6 +1,6 @@
 include(ExternalProject)
 
-set(TT_METAL_VERSION "2f59d5e02ef0da955b99cf0f49da1759c772a67a")
+set(TT_METAL_VERSION "388e187b27f8924ea09512719f666407c40f6bd9")
 
 if ("$ENV{ARCH_NAME}" STREQUAL "grayskull")
   set(ARCH_NAME "grayskull")
@@ -34,6 +34,7 @@ set(TTMETAL_INCLUDE_DIRS
   ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/fmt/73b5ec45edbd92babfd91c3777a9e1ab9cac8238/include
   ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/magic_enum/1e1af177d4ab0ef660f105434fd1017c4d1f8c17/include/magic_enum
   ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/boost_core/e679bef5c160cf29d0f37d549881dc5f5a58c332/include
+  ${PROJECT_SOURCE_DIR}/third_party/tt-metal/src/tt-metal/.cpmcache/json/230202b6f5267cbf0c8e5a2f17301964d95f83ff/include
   PARENT_SCOPE
 )
diff --git a/tools/explorer/CMakeLists.txt b/tools/explorer/CMakeLists.txt
index e0128691a1..3879558545 100644
--- a/tools/explorer/CMakeLists.txt
+++ b/tools/explorer/CMakeLists.txt
@@ -3,7 +3,7 @@ include(ExternalProject)
 
 set(TT_EXPLORER_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/run.py)
 set(TTMLIR_BUILD_BIN_DIR ${TTMLIR_BINARY_DIR}/bin)
 
-set(MODEL_EXPLORER_VERSION "d0b53c3b7049fd41ea1caff193706272c399fac9")
+set(MODEL_EXPLORER_VERSION "ca884d5eb3291507e7f4e76776957e231b2d9b6d")
 ExternalProject_Add(
     model-explorer
     PREFIX ${CMAKE_CURRENT_SOURCE_DIR}/model-explorer
@@ -18,7 +18,9 @@ ExternalProject_Add(
 
 add_custom_target(explorer
   COMMENT "Building tt-explorer... ${TTMLIR_BIN_DIR}"
  COMMAND pip install $<$:-e> ${CMAKE_CURRENT_SOURCE_DIR}/tt_adapter
-  COMMAND pip install ${CMAKE_CURRENT_SOURCE_DIR}/model-explorer/src/model-explorer/src/server/package
+  COMMAND pip uninstall -y ai-edge-model-explorer
+  COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_SOURCE_DIR}/model-explorer/src/model-explorer/src/server/package/build
+  COMMAND pip install --upgrade ${CMAKE_CURRENT_SOURCE_DIR}/model-explorer/src/model-explorer/src/server/package
 
   DEPENDS TTMLIRPythonModules model-explorer ttrt ttmlir-opt ttmlir-translate
 )
diff --git a/tools/explorer/test/run_tests.py b/tools/explorer/test/run_tests.py
index 75925a44e2..485104fbbf 100644
--- a/tools/explorer/test/run_tests.py
+++ b/tools/explorer/test/run_tests.py
@@ -124,7 +124,7 @@ def test_execute_model(model_path):
 
 def test_execute_mnist_l1_interleaved():
     execute_command_and_wait(
         "test/ttmlir/Silicon/TTNN/optimizer/mnist_sharding.mlir",
-        {"optimizationPolicy": "L1 Interleaved"},
+        {"optimizationPolicy": "Greedy L1 Interleaved"},
         timeout=60,
     )
diff --git a/tools/explorer/tt_adapter/src/tt_adapter/main.py b/tools/explorer/tt_adapter/src/tt_adapter/main.py
index 53ea686691..9d0307d113 100644
--- a/tools/explorer/tt_adapter/src/tt_adapter/main.py
+++ b/tools/explorer/tt_adapter/src/tt_adapter/main.py
@@ -10,7 +10,8 @@ class OptimizationPolicy(enum.Enum):
     DFSharding = "DF Sharding"
-    L1Interleaved = "L1 Interleaved"
+    GreedyL1Interleaved = "Greedy L1 Interleaved"
+    BFInterleaved = "BF Interleaved"
     OptimizerDisabled = "Optimizer Disabled"
 
@@ -55,7 +56,9 @@ def convert(
         # Convert TTIR to Model Explorer Graphs and Display/Return
         graph, perf_data = mlir.build_graph(module, perf_trace)
-        return {"graphs": [graph], "perf_data": perf_data}
+        if perf_data:
+            graph = utils.add_to_dataclass(graph, "perf_data", perf_data.graphsData)
+        return {"graphs": [graph]}
 
     def execute(
         self, model_path: str, settings: Dict
diff --git a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py
index 843606b06c..eac036a38e 100644
--- a/tools/explorer/tt_adapter/src/tt_adapter/mlir.py
+++ b/tools/explorer/tt_adapter/src/tt_adapter/mlir.py
@@ -7,20 +7,14 @@
 from model_explorer import graph_builder, node_data_builder
 
 from ttmlir.dialects import tt, ttnn, ttir
-from ttmlir import ir
+from ttmlir import ir, util
 
 
 def get_loc_str(loc):
     try:
-        # Constant loc( at the start of the location and ) at the end. Can just strip these characters
-        loc = str(loc)
-        if loc.startswith("loc(") and loc.endswith(")"):
-            # Fuzzy parse first string inside location
-            # 'loc("matmul_1"("MNISTLinear":4294967295:10))' -> matmul_1
-            # TODO(odjuricic) Need to have this pybinded.
-            res = re.search(r'"([^"]+)"', loc).group(1)
-        else:
-            res = loc  # This is a fallback to just visualize / see what the loc is if not processable.
+        res = util.get_loc_name(loc)
+        if res == "-":
+            res = util.get_loc_full(loc)
     except:
         res = "unknown"
     return res
diff --git a/tools/explorer/tt_adapter/src/tt_adapter/utils.py b/tools/explorer/tt_adapter/src/tt_adapter/utils.py
index 4b404a204b..6cc32c81ee 100644
--- a/tools/explorer/tt_adapter/src/tt_adapter/utils.py
+++ b/tools/explorer/tt_adapter/src/tt_adapter/utils.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import ttmlir
-from dataclasses import make_dataclass
+from dataclasses import make_dataclass, is_dataclass, asdict
 
 
 def parse_mlir_file(model_path):
@@ -18,5 +18,15 @@ def to_dataclass(obj: dict, dc_name: str = "tempClass"):
     return make_dataclass(dc_name, ((k, type(v)) for k, v in obj.items()))(**obj)
 
 
-def to_adapter_format(obj: dict):
-    return {"graphs": [to_dataclass(obj)]}
+def add_to_dataclass(dataclass, new_attr_name: str, new_attr_value):
+    if not is_dataclass(dataclass):
+        return None
+    classname = dataclass.__class__.__name__
+    dataclass = asdict(dataclass)
+    dataclass[new_attr_name] = new_attr_value
+    return to_dataclass(dataclass, dc_name=classname)
+
+
+def to_adapter_format(*objs):
+    res = [x if is_dataclass(x) else to_dataclass(x) for x in objs]
+    return {"graphs": res}
diff --git a/tools/ttnn-standalone/CMakeLists.txt b/tools/ttnn-standalone/CMakeLists.txt
index 23c78c7ca9..0be29d763d 100644
--- a/tools/ttnn-standalone/CMakeLists.txt
+++ b/tools/ttnn-standalone/CMakeLists.txt
@@ -63,7 +63,7 @@ set(INCLUDE_DIRS
     $ENV{TT_METAL_HOME}
     $ENV{TT_METAL_HOME}/tt_metal
     $ENV{TT_METAL_HOME}/tt_metal/third_party/umd
-    $ENV{TT_METAL_HOME}/tt_metal/third_party/umd/device
+    $ENV{TT_METAL_HOME}/tt_metal/third_party/umd/device/api
    $ENV{TT_METAL_HOME}/tt_metal/third_party/fmt
    $ENV{TT_METAL_HOME}/tt_metal/hw/inc
    $ENV{TT_METAL_HOME}/tt_metal/hw/inc/${ARCH_NAME}