Feed HW alignment requirements from system desc into allocator
- Enforce the NoC/DRAM/PCIe alignment requirements taken from the system
  descriptor in the allocator.
- Annotate function arguments with a new attribute, "argument_allocations",
  carrying info about where inputs are allocated.
nsmithtt committed Aug 7, 2024
1 parent 2bdb825 commit 1680064
Showing 14 changed files with 127 additions and 27 deletions.
1 change: 1 addition & 0 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -29,6 +29,7 @@ inline bool isDeviceMemorySpace(MemorySpace memorySpace) {
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h.inc"

namespace mlir::tt {
SystemDescAttr getCurrentScopeSystemDesc(Operation *op);
DeviceAttr getCurrentScopeDevice(Operation *op);
} // namespace mlir::tt

24 changes: 19 additions & 5 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -101,6 +101,10 @@ def TT_SystemDescAttr : TT_Attr<"SystemDesc", "system_desc"> {
let extraClassDeclaration = [{
static tt::SystemDescAttr getDefault(MLIRContext *context);
static tt::SystemDescAttr getFromPath(MLIRContext *context, std::string& path);
unsigned getAddressAlignBytes(unsigned chipIndex = 0) const;
unsigned getNocL1AddressAlignBytes(unsigned chipIndex = 0) const;
unsigned getNocDRAMAddressAlignBytes(unsigned chipIndex = 0) const;
unsigned getPcieAddressAlignBytes(unsigned chipIndex = 0) const;
}];
}
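These accessors surface the per-chip NoC/DRAM/PCIe alignment constraints to compiler passes. A minimal sketch of how a pass might pick an allocator alignment, using only names that appear in this diff (everything else is illustrative):

// Sketch: assumes some ancestor op carries the tt.system_desc attribute,
// which the pipeline changes below arrange via --ttir-load-system-desc.
static unsigned getAllocAlignment(mlir::Operation *op) {
  auto systemDesc = mlir::tt::getCurrentScopeSystemDesc(op);
  assert(systemDesc && "expected tt.system_desc on an enclosing op");
  // Strictest of the NoC L1 / NoC DRAM / PCIe alignments for chip 0.
  return systemDesc.getAddressAlignBytes(/*chipIndex=*/0);
}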

@@ -186,18 +190,19 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
GridAttr grid = {},
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}},
OOBVal oobVal = OOBVal::Undef);
LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef<int64_t> tensorShape, GridAttr grid, ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withGrid(::mlir::MLIRContext *context,
RankedTensorType ty,
GridAttr grid,
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});

MemorySpace getMemorySpace() const;
bool isSystemMemorySpace() const { return ::mlir::tt::isSystemMemorySpace(getMemorySpace()); }
bool isDeviceMemorySpace() const { return ::mlir::tt::isDeviceMemorySpace(getMemorySpace()); }
Type getElementType() const;
llvm::SmallVector<int64_t> getStride(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getPhysicalShape(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getShardShape() const;
LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef<int64_t> tensorShape, GridAttr grid, ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withGrid(::mlir::MLIRContext *context,
RankedTensorType ty,
GridAttr grid,
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType);
}];
}
@@ -240,6 +245,15 @@ def TT_OperandConstraintAttr : EnumAttr<TT_Dialect, TT_OperandConstraint, "opera

def TT_OperandConstraintArrayAttr : TypedArrayAttrBase<TT_OperandConstraintAttr, "">;

def TT_ArgumentAllocationAttr : TT_Attr<"ArgumentAllocation", "arg_alloc", []> {
let summary = "Argument allocation attribute in TT dialect";
let description = [{
Holds the metadata for the allocation of a function argument, i.e. for graph inputs.
}];
let parameters = (ins "uint64_t":$address, "uint64_t":$size, "MemorySpace":$memorySpace);
let assemblyFormat = "`<` $address `,` $size `,` $memorySpace `>`";
}
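Given the assembly format above, one entry would print along the lines of #tt.arg_alloc<262144, 32768, l1> (values illustrative; the exact memory-space keyword follows the MemorySpace enum printer). A hedged sketch of building an entry, mirroring the allocator pass change below:

// Sketch only: the address/size are made-up values, the builder is assumed to
// come from the enclosing pass, and MemorySpace::DeviceL1 is assumed to be one
// of the dialect's memory-space enum cases.
static mlir::Attribute makeArgAlloc(mlir::Builder &builder) {
  return builder.getAttr<mlir::tt::ArgumentAllocationAttr>(
      /*address=*/262144, /*size=*/32768, mlir::tt::MemorySpace::DeviceL1);
}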

//===----------------------------------------------------------------------===//
// TT type definitions
//===----------------------------------------------------------------------===//
9 changes: 7 additions & 2 deletions include/ttmlir/Dialect/TTIR/IR/TTIROps.td
@@ -60,8 +60,8 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> {
let regions = (region AnyRegion:$region);
}

def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> {
let summary = "ToLayout op.";
def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface, TTIROpInterface]> {
let summary = "Layout op.";
let description = [{
The ToLayout operation transitions a tensor from one layout to another. Some examples include:
- Transitioning between different memory spaces, e.g. DRAM to L1.
@@ -83,6 +83,11 @@ def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> {

let extraClassDeclaration = [{
MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); }
ArrayAttr getOperandConstraints() {
return nullptr;
// TODO return below, but we need a way to properly create an ArrayAttr:
// return {OperandConstraint::Any, OperandConstraint::Any};
}
}];

let hasVerifier = 1;
10 changes: 10 additions & 0 deletions include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
@@ -21,6 +21,16 @@ def TTIROpInterface : OpInterface<"TTIROp"> {
/*methodBody=*/"",
/*defaultImplementation=*/""
>,
InterfaceMethod<
/*desc=*/[{
Get the system descriptor of the current scope.
}],
/*retTy=*/"::mlir::tt::SystemDescAttr",
/*methodName=*/"getSystemDesc",
/*args=*/(ins),
/*methodBody=*/"",
/*defaultImplementation=*/"return ::mlir::tt::getCurrentScopeSystemDesc($_op);"
>,
InterfaceMethod<
/*desc=*/[{
Get the device of the current scope.
2 changes: 1 addition & 1 deletion include/ttmlir/Target/Utils/MLIRToFlatbuffer.h
@@ -218,7 +218,7 @@ toFlatbuffer(FlatbufferObjectCache &cache, GridAttr tensorGrid,
SmallVector<std::int64_t> tensorGridShape(tensorGrid.getShape());
AffineMap mapping = deviceGrid.getMapping();
::ttmlir::utils::sample(
tensorGridShape, [&](SmallVector<std::int64_t> const &virtualCoreCoord) {
tensorGridShape, [&](ArrayRef<std::int64_t> virtualCoreCoord) {
SmallVector<std::int64_t> coreCoord = mapping.compose(virtualCoreCoord);
assert(coreCoord.size() == 3 && "expected a 2D core");
assert(coreCoord[0] == 0 && "expected single device");
9 changes: 7 additions & 2 deletions include/ttmlir/Utils.h
@@ -7,18 +7,23 @@

#include <cstdint>

#include "llvm/ADT/SmallVector.h"

namespace ttmlir::utils {
template <typename T> T alignUp(T ptr, T alignment) {
return (ptr + alignment - 1) & ~(alignment - 1);
}
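// Example: alignUp<uint64_t>(1000, 16) == 1008 and alignUp<uint64_t>(1024, 16)
// == 1024. The bit-trick assumes a power-of-two alignment, which the NoC/DRAM/
// PCIe alignments fed in from the system descriptor are in practice.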

template <typename Vector, typename Fn>
inline void sample(Vector const &shape, Fn fn) {
Vector strides(shape.size());
llvm::SmallVector<std::int64_t, 8> strides(shape.size());
std::int64_t stride = 1;
for (std::int64_t i = shape.size() - 1; i >= 0; --i) {
strides[i] = stride;
stride *= shape[i];
}
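// e.g. shape = {2, 3} yields strides = {3, 1}; the loop below then visits the
// volume = 6 multi-dimensional indices, presumably in row-major order.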

Vector index(shape.size());
llvm::SmallVector<std::int64_t, 8> index(shape.size());
int64_t volume = stride;
for (int64_t i = 0; i < volume; ++i) {
for (unsigned j = 0; j < shape.size(); ++j) {
34 changes: 34 additions & 0 deletions lib/Dialect/TT/IR/TTOpsTypes.cpp
@@ -8,13 +8,16 @@
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/DialectImplementation.h"
#include "ttmlir/Dialect/TT/IR/TT.h"
#include "ttmlir/Target/Common/system_desc_generated.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/TypeSwitch.h"

#include "ttmlir/Utils.h"

using namespace mlir::tt;

#include "ttmlir/Dialect/TT/IR/TTOpsEnums.cpp.inc"
@@ -126,6 +129,26 @@ mlir::tt::SystemDescAttr::getFromPath(MLIRContext *context, std::string &path) {
return system_desc_attr;
}

unsigned SystemDescAttr::getAddressAlignBytes(unsigned chipIndex) const {
  return std::max(std::initializer_list<unsigned>{
      getNocL1AddressAlignBytes(chipIndex),
      getNocDRAMAddressAlignBytes(chipIndex),
      getPcieAddressAlignBytes(chipIndex),
  });
}
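// e.g. if chip 0 reports L1 = 16, DRAM = 32, and PCIe = 64 (illustrative
// values), this returns 64, so the allocator satisfies the strictest consumer.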

unsigned SystemDescAttr::getNocL1AddressAlignBytes(unsigned chipIndex) const {
return getChipDescs()[chipIndex].getNocL1AddressAlignBytes();
}

unsigned SystemDescAttr::getNocDRAMAddressAlignBytes(unsigned chipIndex) const {
return getChipDescs()[chipIndex].getNocDRAMAddressAlignBytes();
}

unsigned SystemDescAttr::getPcieAddressAlignBytes(unsigned chipIndex) const {
return getChipDescs()[chipIndex].getPcieAddressAlignBytes();
}

static mlir::MemRefType buildMemRef(::mlir::MLIRContext *context,
::llvm::ArrayRef<int64_t> shardShape,
::mlir::Type elementType,
@@ -479,6 +502,17 @@ uint64_t TileType::getSizeBytes() const {
}
}

SystemDescAttr mlir::tt::getCurrentScopeSystemDesc(mlir::Operation *op) {
while (op) {
if (auto systemDesc =
op->getAttrOfType<SystemDescAttr>(SystemDescAttr::name)) {
return systemDesc;
}
op = op->getParentOp();
}
return nullptr;
}

DeviceAttr mlir::tt::getCurrentScopeDevice(mlir::Operation *op) {
while (op) {
if (auto device = op->getAttrOfType<DeviceAttr>(DeviceAttr::name)) {
3 changes: 3 additions & 0 deletions lib/Dialect/TTIR/IR/TTIROps.cpp
@@ -23,6 +23,9 @@ ::mlir::LogicalResult mlir::tt::ttir::ToLayoutOp::verify() {
if (not outputLayout) {
return emitOpError("Output tensor type missing layout attribute");
}
if (inputTy.getShape() != outputTy.getShape()) {
return emitOpError("Input and output shapes must be the same");
}
return success();
}
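// Consequence: to_layout may change only the layout encoding (grid, memory
// space, element type), never the logical shape; e.g. a 64x128 -> 32x128
// to_layout is now rejected at verification time.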

30 changes: 24 additions & 6 deletions lib/Dialect/TTIR/Transforms/Passes.cpp
@@ -21,6 +21,7 @@
#include "ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h"
#include "ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h"
#include "ttmlir/Dialect/TTIR/Transforms/Passes.h"
#include "ttmlir/Utils.h"

namespace mlir::tt::ttir {
#define GEN_PASS_DEF_TTIRGENERIC
@@ -581,14 +582,14 @@ inline uint64_t getTensorMemrefSizeBytes(RankedTensorType ty) {
class TTIRAllocate : public impl::TTIRAllocateBase<TTIRAllocate> {
struct SimpleAllocator {
static constexpr uint64_t kBaseAddress = 1llu << 18llu;
uint64_t addressAlignment;

SimpleAllocator(uint64_t addressAlignment)
: addressAlignment(addressAlignment) {}

SmallVector<uint64_t> currPtr = SmallVector<uint64_t>(
getMaxEnumValForMemorySpace() + 1llu, kBaseAddress);

uint64_t alignUp(uint64_t ptr, uint64_t alignment) {
return (ptr + alignment - 1) & ~(alignment - 1);
}

uint64_t allocate(uint64_t size, MemorySpace memorySpace) {
if (isSystemMemorySpace(memorySpace)) {
return 0;
Expand All @@ -597,7 +598,7 @@ class TTIRAllocate : public impl::TTIRAllocateBase<TTIRAllocate> {
uint32_t index = static_cast<uint32_t>(memorySpace);
assert(index < currPtr.size());
uint64_t &ptr = currPtr[index];
ptr = alignUp(ptr, 16);
ptr = ttmlir::utils::alignUp(ptr, addressAlignment);
auto result = ptr;
ptr += size;
return result;
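// Worked example (illustrative): with addressAlignment = 32 and kBaseAddress
// = 1 << 18 = 262144, allocate(100, DeviceL1) returns 262144 and bumps the
// cursor to 262244; a second allocate(100, DeviceL1) first re-aligns the
// cursor to alignUp(262244, 32) = 262272 and returns that.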
@@ -638,10 +639,27 @@

module->walk([&](func::FuncOp func) {
assert(func.getBody().hasOneBlock());
SimpleAllocator allocator;
auto systemDesc = getCurrentScopeSystemDesc(func);
assert(systemDesc);
auto addressAlignment = systemDesc.getAddressAlignBytes();
SimpleAllocator allocator(addressAlignment);
Liveness liveness(func.getOperation());
const LivenessBlockInfo *livenessInfo =
liveness.getLiveness(&func.getBody().front());

mlir::SmallVector<Attribute> argumentAllocations;
for (auto operand : func.getArguments()) {
auto operandTy = mlir::cast<RankedTensorType>(operand.getType());
assert(operandTy.getEncoding());
auto memorySpace = getMemorySpace(operandTy);
auto sizeBytes = getTensorMemrefSizeBytes(operandTy);
auto address = allocator.allocate(sizeBytes, memorySpace);
argumentAllocations.push_back(rewriter.getAttr<ArgumentAllocationAttr>(
address, sizeBytes, memorySpace));
}
func->setDiscardableAttr("argument_allocations",
rewriter.getArrayAttr(argumentAllocations));
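// After this loop the function might carry, e.g. (illustrative values):
//   attributes {argument_allocations = [#tt.arg_alloc<262144, 32768, l1>, ...]}
// SerializeToBinary reads these back to emit real device addresses for inputs.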

func->walk([&](tensor::EmptyOp empty) {
auto resultTy =
mlir::cast<RankedTensorType>(empty.getResult().getType());
2 changes: 2 additions & 0 deletions lib/Dialect/TTMetal/Transforms/Passes.cpp
@@ -250,6 +250,8 @@ class ConvertTTIRToTTMetal
};

void createTTIRToTTMetalBackendPipeline(OpPassManager &pm) {
pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc());
pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice());
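// The two passes above run first so the system-desc and device attributes
// exist by the time TTIRAllocate queries them via getCurrentScopeSystemDesc
// (which the allocator asserts on).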
pm.addPass(mlir::tt::ttir::createTTIRGeneric());
pm.addPass(mlir::tt::ttir::createTTIRLayout());
pm.addPass(mlir::tt::ttir::createTTIRGenericRegionOperandsToMemref());
24 changes: 16 additions & 8 deletions lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp
@@ -103,9 +103,6 @@ class TTMetalSerializeToBinary
}

void runOnOperation() final {
constexpr uint64_t kHostAllocatedAddress = 0;
constexpr uint64_t kHostAllocatedSize = 0;

::flatbuffers::FlatBufferBuilder fbb;
FlatbufferObjectCache cache(&fbb);
CQBuilder cqBuilder(&fbb);
@@ -117,10 +114,22 @@
assert(entry && "expected an entry function");
cqBuilder.name = entry.getSymName().data();

auto argumentAllocations = mlir::cast<ArrayAttr>(
entry->getDiscardableAttr("argument_allocations"));
assert(argumentAllocations && "expected argument_allocations attribute");
for (auto &input : entry.getBody().getArguments()) {
auto argAlloc = mlir::cast<tt::ArgumentAllocationAttr>(
argumentAllocations[input.getArgNumber()]);
assert(
argAlloc.getMemorySpace() ==
mlir::cast<tt::LayoutAttr>(
mlir::cast<RankedTensorType>(input.getType()).getEncoding())
.getMemorySpace() &&
"argument allocation memory space does not match tensor type memory "
"space");
cqBuilder.inputs.push_back(
cache.getOrCreate(input, tensorValueToFlatbuffer,
kHostAllocatedAddress, kHostAllocatedSize));
argAlloc.getAddress(), argAlloc.getSize()));
}
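// Inputs now serialize with the real device address/size assigned by
// TTIRAllocate, replacing the kHostAllocatedAddress/kHostAllocatedSize
// placeholders (both 0) deleted above.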

module->walk([&](mlir::Operation *op) {
@@ -145,17 +154,16 @@
mlir::cast<ttkernel::ThreadTypeAttr>(
dispatchOp.getThreadTypes()[region.getRegionNumber()])
.getValue();
std::vector<::tt::target::Dim2dRange> core_range = {
std::vector<::tt::target::Dim2dRange> coreRangeSet = {
toFlatbuffer(mlir::cast<CoreRangeAttr>(
dispatchOp.getCoreRanges()[region.getRegionNumber()])),
};
dispatchOp.getCoreRanges()[region.getRegionNumber()]))};
std::vector<::flatbuffers::Offset<::tt::target::CBRef>> cbs;
kernels.push_back(::tt::target::metal::CreateKernelDescDirect(
fbb, ::tt::target::metal::Kernel::KernelSource,
::tt::target::metal::CreateKernelSourceDirect(
fbb, toFlatbuffer(threadType), source.c_str())
.Union(),
&core_range, &cbs, nullptr /*TODO debug info*/));
&coreRangeSet, &cbs, nullptr /*TODO debug info*/));
}
::flatbuffers::Offset<::tt::target::metal::ProgramDesc> program =
::tt::target::metal::CreateProgramDescDirect(fbb, &kernels);
2 changes: 1 addition & 1 deletion test/ttmlir/Dialect/TTIR/test_allocate.mlir
@@ -1,4 +1,4 @@
// RUN: ttmlir-opt --ttir-layout --ttir-allocate %s | FileCheck %s
// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-layout --ttir-allocate %s | FileCheck %s
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
2 changes: 1 addition & 1 deletion test/ttmlir/Dialect/TTMetal/simple_multiply.mlir
@@ -1,4 +1,4 @@
// RUN: ttmlir-opt --ttir-generic --ttir-layout --ttir-generic-region-operands-to-memref --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-generic --ttir-layout --ttir-generic-region-operands-to-memref --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
2 changes: 1 addition & 1 deletion test/ttmlir/Dialect/TTMetal/to_layout.mlir
@@ -1,4 +1,4 @@
// RUN: ttmlir-opt --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
#l1_ = #tt.memory_space<l1>
#layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>>
#layout1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<64x32xf32, #l1_>>
