diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h index 5b90a9a74..552fa6b5d 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h @@ -29,6 +29,7 @@ inline bool isDeviceMemorySpace(MemorySpace memorySpace) { #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h.inc" namespace mlir::tt { +SystemDescAttr getCurrentScopeSystemDesc(Operation *op); DeviceAttr getCurrentScopeDevice(Operation *op); } // namespace mlir::tt diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td index 27a64d786..a75835520 100644 --- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td +++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td @@ -101,6 +101,10 @@ def TT_SystemDescAttr : TT_Attr<"SystemDesc", "system_desc"> { let extraClassDeclaration = [{ static tt::SystemDescAttr getDefault(MLIRContext *context); static tt::SystemDescAttr getFromPath(MLIRContext *context, std::string& path); + unsigned getAddressAlignBytes(unsigned chipIndex = 0) const; + unsigned getNocL1AddressAlignBytes(unsigned chipIndex = 0) const; + unsigned getNocDRAMAddressAlignBytes(unsigned chipIndex = 0) const; + unsigned getPcieAddressAlignBytes(unsigned chipIndex = 0) const; }]; } @@ -186,6 +190,12 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { GridAttr grid = {}, ArrayRef> collapseIntervals = {{0, -1}}, OOBVal oobVal = OOBVal::Undef); + LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); + LayoutAttr withGrid(::mlir::MLIRContext *context, + RankedTensorType ty, + GridAttr grid, + ArrayRef> collapseIntervals = {{0, -1}}); + MemorySpace getMemorySpace() const; bool isSystemMemorySpace() const { return ::mlir::tt::isSystemMemorySpace(getMemorySpace()); } bool isDeviceMemorySpace() const { return ::mlir::tt::isDeviceMemorySpace(getMemorySpace()); } @@ -193,11 +203,6 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> { 
llvm::SmallVector getStride(ArrayRef logicalShape) const; llvm::SmallVector getPhysicalShape(ArrayRef logicalShape) const; llvm::SmallVector getShardShape() const; - LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef tensorShape, GridAttr grid, ArrayRef> collapseIntervals = {{0, -1}}); - LayoutAttr withGrid(::mlir::MLIRContext *context, - RankedTensorType ty, - GridAttr grid, - ArrayRef> collapseIntervals = {{0, -1}}); LayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType); }]; } @@ -240,6 +245,15 @@ def TT_OperandConstraintAttr : EnumAttr; +def TT_ArgumentAllocationAttr : TT_Attr<"ArgumentAllocation", "arg_alloc", []> { + let summary = "Argument allocation attribute in TT dialect"; + let description = [{ + Holds the metadata for the allocation of a function argument i.e. for graph inputs. + }]; + let parameters = (ins "uint64_t":$address, "uint64_t":$size, "MemorySpace":$memorySpace); + let assemblyFormat = "`<` $address `,` $size `,` $memorySpace `>`"; +} + //===----------------------------------------------------------------------===// // TT type definitions //===----------------------------------------------------------------------===// diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td index 99cae3ed2..d678da50d 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td @@ -60,8 +60,8 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> { let regions = (region AnyRegion:$region); } -def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> { - let summary = "ToLayout op."; +def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface, TTIROpInterface]> { + let summary = "Layout op."; let description = [{ ToLayout operation, transition tensors from one layout to another. Some examples include: - Transitioning between different memory spaces, e.g. DRAM to L1. 
@@ -83,6 +83,11 @@ def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> { let extraClassDeclaration = [{ MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); } + ArrayAttr getOperandConstraints() { + return nullptr; + // TODO return below, but we need a way to properly create an ArrayAttr: + // return {OperandConstraint::Any, OperandConstraint::Any}; + } }]; let hasVerifier = 1; diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td index 0c24b685a..64c8323fe 100644 --- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td +++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td @@ -21,6 +21,16 @@ def TTIROpInterface : OpInterface<"TTIROp"> { /*methodBody=*/"", /*defaultImplementation=*/"" >, + InterfaceMethod< + /*desc=*/[{ + Get the system desc of the current scope. + }], + /*retTy=*/"::mlir::tt::SystemDescAttr", + /*methodName=*/"getSystemDesc", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/"return ::mlir::tt::getCurrentScopeSystemDesc($_op);" + >, InterfaceMethod< /*desc=*/[{ Get the device of the current scope. 
diff --git a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h index e4f041c91..38a7c0250 100644 --- a/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h +++ b/include/ttmlir/Target/Utils/MLIRToFlatbuffer.h @@ -218,7 +218,7 @@ toFlatbuffer(FlatbufferObjectCache &cache, GridAttr tensorGrid, SmallVector tensorGridShape(tensorGrid.getShape()); AffineMap mapping = deviceGrid.getMapping(); ::ttmlir::utils::sample( - tensorGridShape, [&](SmallVector const &virtualCoreCoord) { + tensorGridShape, [&](ArrayRef virtualCoreCoord) { SmallVector coreCoord = mapping.compose(virtualCoreCoord); assert(coreCoord.size() == 3 && "expected a 2D core"); assert(coreCoord[0] == 0 && "expected single device"); diff --git a/include/ttmlir/Utils.h b/include/ttmlir/Utils.h index 79297978e..2dda7f951 100644 --- a/include/ttmlir/Utils.h +++ b/include/ttmlir/Utils.h @@ -7,18 +7,23 @@ #include +#include "llvm/ADT/SmallVector.h" + namespace ttmlir::utils { +template T alignUp(T ptr, T alignment) { + return (ptr + alignment - 1) & ~(alignment - 1); +} template inline void sample(Vector const &shape, Fn fn) { - Vector strides(shape.size()); + llvm::SmallVector strides(shape.size()); std::int64_t stride = 1; for (std::int64_t i = shape.size() - 1; i >= 0; --i) { strides[i] = stride; stride *= shape[i]; } - Vector index(shape.size()); + llvm::SmallVector index(shape.size()); int64_t volume = stride; for (int64_t i = 0; i < volume; ++i) { for (unsigned j = 0; j < shape.size(); ++j) { diff --git a/lib/Dialect/TT/IR/TTOpsTypes.cpp b/lib/Dialect/TT/IR/TTOpsTypes.cpp index d54bf6881..89d4da338 100644 --- a/lib/Dialect/TT/IR/TTOpsTypes.cpp +++ b/lib/Dialect/TT/IR/TTOpsTypes.cpp @@ -8,6 +8,7 @@ #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/DialectImplementation.h" #include "ttmlir/Dialect/TT/IR/TT.h" #include "ttmlir/Target/Common/system_desc_generated.h" @@ -15,6 +16,8 @@ #include 
"llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" +#include "ttmlir/Utils.h" + using namespace mlir::tt; #include "ttmlir/Dialect/TT/IR/TTOpsEnums.cpp.inc" @@ -126,6 +129,26 @@ mlir::tt::SystemDescAttr::getFromPath(MLIRContext *context, std::string &path) { return system_desc_attr; } +unsigned SystemDescAttr::getAddressAlignBytes(unsigned chipIndex) const { + return std::max(std::initializer_list{ + getNocL1AddressAlignBytes(), + getNocDRAMAddressAlignBytes(), + getPcieAddressAlignBytes(), + }); +} + +unsigned SystemDescAttr::getNocL1AddressAlignBytes(unsigned chipIndex) const { + return getChipDescs()[chipIndex].getNocL1AddressAlignBytes(); +} + +unsigned SystemDescAttr::getNocDRAMAddressAlignBytes(unsigned chipIndex) const { + return getChipDescs()[chipIndex].getNocDRAMAddressAlignBytes(); +} + +unsigned SystemDescAttr::getPcieAddressAlignBytes(unsigned chipIndex) const { + return getChipDescs()[chipIndex].getPcieAddressAlignBytes(); +} + static mlir::MemRefType buildMemRef(::mlir::MLIRContext *context, ::llvm::ArrayRef shardShape, ::mlir::Type elementType, @@ -479,6 +502,17 @@ uint64_t TileType::getSizeBytes() const { } } +SystemDescAttr mlir::tt::getCurrentScopeSystemDesc(mlir::Operation *op) { + while (op) { + if (auto systemDesc = + op->getAttrOfType(SystemDescAttr::name)) { + return systemDesc; + } + op = op->getParentOp(); + } + return nullptr; +} + DeviceAttr mlir::tt::getCurrentScopeDevice(mlir::Operation *op) { while (op) { if (auto device = op->getAttrOfType(DeviceAttr::name)) { diff --git a/lib/Dialect/TTIR/IR/TTIROps.cpp b/lib/Dialect/TTIR/IR/TTIROps.cpp index 13eff68e7..4a9bcbbc0 100644 --- a/lib/Dialect/TTIR/IR/TTIROps.cpp +++ b/lib/Dialect/TTIR/IR/TTIROps.cpp @@ -23,6 +23,9 @@ ::mlir::LogicalResult mlir::tt::ttir::ToLayoutOp::verify() { if (not outputLayout) { return emitOpError("Output tensor type missing layout attribute"); } + if (inputTy.getShape() != outputTy.getShape()) { + return emitOpError("Input and output shapes must be the 
same"); + } return success(); } diff --git a/lib/Dialect/TTIR/Transforms/Passes.cpp b/lib/Dialect/TTIR/Transforms/Passes.cpp index 7d8d84a6b..ddc945a18 100644 --- a/lib/Dialect/TTIR/Transforms/Passes.cpp +++ b/lib/Dialect/TTIR/Transforms/Passes.cpp @@ -21,6 +21,7 @@ #include "ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h" #include "ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h" #include "ttmlir/Dialect/TTIR/Transforms/Passes.h" +#include "ttmlir/Utils.h" namespace mlir::tt::ttir { #define GEN_PASS_DEF_TTIRGENERIC @@ -581,14 +582,14 @@ inline uint64_t getTensorMemrefSizeBytes(RankedTensorType ty) { class TTIRAllocate : public impl::TTIRAllocateBase { struct SimpleAllocator { static constexpr uint64_t kBaseAddress = 1llu << 18llu; + uint64_t addressAlignment; + + SimpleAllocator(uint64_t addressAlignment) + : addressAlignment(addressAlignment) {} SmallVector currPtr = SmallVector( getMaxEnumValForMemorySpace() + 1llu, kBaseAddress); - uint64_t alignUp(uint64_t ptr, uint64_t alignment) { - return (ptr + alignment - 1) & ~(alignment - 1); - } - uint64_t allocate(uint64_t size, MemorySpace memorySpace) { if (isSystemMemorySpace(memorySpace)) { return 0; @@ -597,7 +598,7 @@ class TTIRAllocate : public impl::TTIRAllocateBase { uint32_t index = static_cast(memorySpace); assert(index < currPtr.size()); uint64_t &ptr = currPtr[index]; - ptr = alignUp(ptr, 16); + ptr = ttmlir::utils::alignUp(ptr, addressAlignment); auto result = ptr; ptr += size; return result; @@ -638,10 +639,27 @@ class TTIRAllocate : public impl::TTIRAllocateBase { module->walk([&](func::FuncOp func) { assert(func.getBody().hasOneBlock()); - SimpleAllocator allocator; + auto systemDesc = getCurrentScopeSystemDesc(func); + assert(systemDesc); + auto addressAlignment = systemDesc.getAddressAlignBytes(); + SimpleAllocator allocator(addressAlignment); Liveness liveness(func.getOperation()); const LivenessBlockInfo *livenessInfo = liveness.getLiveness(&func.getBody().front()); + + 
mlir::SmallVector argumentAllocations; + for (auto operand : func.getArguments()) { + auto operandTy = mlir::cast(operand.getType()); + assert(operandTy.getEncoding()); + auto memorySpace = getMemorySpace(operandTy); + auto sizeBytes = getTensorMemrefSizeBytes(operandTy); + auto address = allocator.allocate(sizeBytes, memorySpace); + argumentAllocations.push_back(rewriter.getAttr( + address, sizeBytes, memorySpace)); + } + func->setDiscardableAttr("argument_allocations", + rewriter.getArrayAttr(argumentAllocations)); + func->walk([&](tensor::EmptyOp empty) { auto resultTy = mlir::cast(empty.getResult().getType()); diff --git a/lib/Dialect/TTMetal/Transforms/Passes.cpp b/lib/Dialect/TTMetal/Transforms/Passes.cpp index 099318a22..05b5051b3 100644 --- a/lib/Dialect/TTMetal/Transforms/Passes.cpp +++ b/lib/Dialect/TTMetal/Transforms/Passes.cpp @@ -250,6 +250,8 @@ class ConvertTTIRToTTMetal }; void createTTIRToTTMetalBackendPipeline(OpPassManager &pm) { + pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc()); + pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice()); pm.addPass(mlir::tt::ttir::createTTIRGeneric()); pm.addPass(mlir::tt::ttir::createTTIRLayout()); pm.addPass(mlir::tt::ttir::createTTIRGenericRegionOperandsToMemref()); diff --git a/lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp b/lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp index 59e4d7e14..c2076f995 100644 --- a/lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp +++ b/lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp @@ -103,9 +103,6 @@ class TTMetalSerializeToBinary } void runOnOperation() final { - constexpr uint64_t kHostAllocatedAddress = 0; - constexpr uint64_t kHostAllocatedSize = 0; - ::flatbuffers::FlatBufferBuilder fbb; FlatbufferObjectCache cache(&fbb); CQBuilder cqBuilder(&fbb); @@ -117,10 +114,22 @@ class TTMetalSerializeToBinary assert(entry && "expected an entry function"); cqBuilder.name = entry.getSymName().data(); + auto argumentAllocations = mlir::cast( + 
entry->getDiscardableAttr("argument_allocations")); + assert(argumentAllocations && "expected argument_allocations attribute"); for (auto &input : entry.getBody().getArguments()) { + auto argAlloc = mlir::cast( + argumentAllocations[input.getArgNumber()]); + assert( + argAlloc.getMemorySpace() == + mlir::cast( + mlir::cast(input.getType()).getEncoding()) + .getMemorySpace() && + "argument allocation memory space does not match tensor type memory " + "space"); cqBuilder.inputs.push_back( cache.getOrCreate(input, tensorValueToFlatbuffer, - kHostAllocatedAddress, kHostAllocatedSize)); + argAlloc.getAddress(), argAlloc.getSize())); } module->walk([&](mlir::Operation *op) { @@ -145,17 +154,16 @@ class TTMetalSerializeToBinary mlir::cast( dispatchOp.getThreadTypes()[region.getRegionNumber()]) .getValue(); - std::vector<::tt::target::Dim2dRange> core_range = { + std::vector<::tt::target::Dim2dRange> coreRangeSet = { toFlatbuffer(mlir::cast( - dispatchOp.getCoreRanges()[region.getRegionNumber()])), - }; + dispatchOp.getCoreRanges()[region.getRegionNumber()]))}; std::vector<::flatbuffers::Offset<::tt::target::CBRef>> cbs; kernels.push_back(::tt::target::metal::CreateKernelDescDirect( fbb, ::tt::target::metal::Kernel::KernelSource, ::tt::target::metal::CreateKernelSourceDirect( fbb, toFlatbuffer(threadType), source.c_str()) .Union(), - &core_range, &cbs, nullptr /*TODO debug info*/)); + &coreRangeSet, &cbs, nullptr /*TODO debug info*/)); } ::flatbuffers::Offset<::tt::target::metal::ProgramDesc> program = ::tt::target::metal::CreateProgramDescDirect(fbb, &kernels); diff --git a/test/ttmlir/Dialect/TTIR/test_allocate.mlir b/test/ttmlir/Dialect/TTIR/test_allocate.mlir index 9b8230e41..0968a67c7 100644 --- a/test/ttmlir/Dialect/TTIR/test_allocate.mlir +++ b/test/ttmlir/Dialect/TTIR/test_allocate.mlir @@ -1,4 +1,4 @@ -// RUN: ttmlir-opt --ttir-layout --ttir-allocate %s | FileCheck %s +// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-layout --ttir-allocate 
%s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { diff --git a/test/ttmlir/Dialect/TTMetal/simple_multiply.mlir b/test/ttmlir/Dialect/TTMetal/simple_multiply.mlir index c1588daec..d65b81a10 100644 --- a/test/ttmlir/Dialect/TTMetal/simple_multiply.mlir +++ b/test/ttmlir/Dialect/TTMetal/simple_multiply.mlir @@ -1,4 +1,4 @@ -// RUN: ttmlir-opt --ttir-generic --ttir-layout --ttir-generic-region-operands-to-memref --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s +// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-generic --ttir-layout --ttir-generic-region-operands-to-memref --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> { diff --git a/test/ttmlir/Dialect/TTMetal/to_layout.mlir b/test/ttmlir/Dialect/TTMetal/to_layout.mlir index f7681e447..50dc32d11 100644 --- a/test/ttmlir/Dialect/TTMetal/to_layout.mlir +++ b/test/ttmlir/Dialect/TTMetal/to_layout.mlir @@ -1,4 +1,4 @@ -// RUN: ttmlir-opt --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s +// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s #l1_ = #tt.memory_space #layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>> #layout1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<64x32xf32, #l1_>>