Feed HW alignment requirements from system desc into allocator #310

Merged · 2 commits · Aug 7, 2024
1 change: 1 addition & 0 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -29,6 +29,7 @@ inline bool isDeviceMemorySpace(MemorySpace memorySpace) {
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h.inc"

namespace mlir::tt {
SystemDescAttr getCurrentScopeSystemDesc(Operation *op);
DeviceAttr getCurrentScopeDevice(Operation *op);
} // namespace mlir::tt

24 changes: 19 additions & 5 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -101,6 +101,10 @@ def TT_SystemDescAttr : TT_Attr<"SystemDesc", "system_desc"> {
let extraClassDeclaration = [{
static tt::SystemDescAttr getDefault(MLIRContext *context);
static tt::SystemDescAttr getFromPath(MLIRContext *context, std::string& path);
unsigned getAddressAlignBytes(unsigned chipIndex = 0) const;
unsigned getNocL1AddressAlignBytes(unsigned chipIndex = 0) const;
unsigned getNocDRAMAddressAlignBytes(unsigned chipIndex = 0) const;
unsigned getPcieAddressAlignBytes(unsigned chipIndex = 0) const;
}];
}

@@ -186,18 +190,19 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
GridAttr grid = {},
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}},
OOBVal oobVal = OOBVal::Undef);
LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef<int64_t> tensorShape, GridAttr grid, ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withGrid(::mlir::MLIRContext *context,
RankedTensorType ty,
GridAttr grid,
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});

MemorySpace getMemorySpace() const;
bool isSystemMemorySpace() const { return ::mlir::tt::isSystemMemorySpace(getMemorySpace()); }
bool isDeviceMemorySpace() const { return ::mlir::tt::isDeviceMemorySpace(getMemorySpace()); }
Type getElementType() const;
llvm::SmallVector<int64_t> getStride(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getPhysicalShape(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getShardShape() const;
LayoutAttr withGrid(::mlir::MLIRContext *context, ArrayRef<int64_t> tensorShape, GridAttr grid, ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withGrid(::mlir::MLIRContext *context,
RankedTensorType ty,
GridAttr grid,
ArrayRef<std::pair<std::int64_t, std::int64_t>> collapseIntervals = {{0, -1}});
LayoutAttr withElementType(::mlir::MLIRContext *context, Type elementType);
}];
}
@@ -240,6 +245,15 @@ def TT_OperandConstraintAttr : EnumAttr<TT_Dialect, TT_OperandConstraint, "opera

def TT_OperandConstraintArrayAttr : TypedArrayAttrBase<TT_OperandConstraintAttr, "">;

def TT_ArgumentAllocationAttr : TT_Attr<"ArgumentAllocation", "arg_alloc", []> {
let summary = "Argument allocation attribute in TT dialect";
let description = [{
Holds the metadata for the allocation of a function argument, i.e., for graph inputs.
}];
let parameters = (ins "uint64_t":$address, "uint64_t":$size, "MemorySpace":$memorySpace);
let assemblyFormat = "`<` $address `,` $size `,` $memorySpace `>`";
}

//===----------------------------------------------------------------------===//
// TT type definitions
//===----------------------------------------------------------------------===//
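The new pieces above compose as follows; a minimal sketch, assuming the builder signature TableGen generates from the parameter list and the usual MemorySpace enumerator spelling:

#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

// Build an allocation record for a 64x128 f32 graph input placed in L1.
// MemorySpace::DeviceL1 and the generated get(...) overload are assumptions
// inferred from the TableGen definition above, not confirmed API.
mlir::tt::ArgumentAllocationAttr makeArgAlloc(mlir::MLIRContext *ctx) {
  return mlir::tt::ArgumentAllocationAttr::get(
      ctx, /*address=*/1ull << 18, /*size=*/64 * 128 * 4,
      mlir::tt::MemorySpace::DeviceL1);
}

// With the assemblyFormat above this prints roughly as:
//   #tt.arg_alloc<262144, 32768, l1>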
9 changes: 7 additions & 2 deletions include/ttmlir/Dialect/TTIR/IR/TTIROps.td
@@ -60,8 +60,8 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> {
let regions = (region AnyRegion:$region);
}

def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> {
let summary = "ToLayout op.";
def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface, TTIROpInterface]> {
let summary = "Layout op.";
let description = [{
ToLayout operation transitions tensors from one layout to another. Some examples include:
- Transitioning between different memory spaces, e.g. DRAM to L1.
@@ -83,6 +83,11 @@ def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> {

let extraClassDeclaration = [{
MutableOperandRange getDpsInitsMutable() { return getOutputMutable(); }
ArrayAttr getOperandConstraints() {
return nullptr;
// TODO return below, but we need a way to properly create an ArrayAttr:
// return {OperandConstraint::Any, OperandConstraint::Any};
}
}];

let hasVerifier = 1;
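One plausible way to discharge the TODO above, sketched under the assumption that OperandConstraintAttr exposes the conventional generated EnumAttr builder:

// Hypothetical helper: wrap each OperandConstraint in its attribute and
// collect the results into an ArrayAttr. OperandConstraintAttr::get(ctx, v)
// is the standard generated EnumAttr API and is assumed here, not verified.
static mlir::ArrayAttr anyOperandConstraints(mlir::MLIRContext *ctx,
                                             unsigned numOperands) {
  llvm::SmallVector<mlir::Attribute> constraints(
      numOperands, mlir::tt::OperandConstraintAttr::get(
                       ctx, mlir::tt::OperandConstraint::Any));
  return mlir::ArrayAttr::get(ctx, constraints);
}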
10 changes: 10 additions & 0 deletions include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
@@ -21,6 +21,16 @@ def TTIROpInterface : OpInterface<"TTIROp"> {
/*methodBody=*/"",
/*defaultImplementation=*/""
>,
InterfaceMethod<
/*desc=*/[{
Get the system desc of the current scope.
}],
/*retTy=*/"::mlir::tt::SystemDescAttr",
/*methodName=*/"getSystemDesc",
/*args=*/(ins),
/*methodBody=*/"",
/*defaultImplementation=*/"return ::mlir::tt::getCurrentScopeSystemDesc($_op);"
>,
InterfaceMethod<
/*desc=*/[{
Get the device of the current scope.
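With the default implementation in place, any op implementing the interface can reach the system desc without knowing about the module attribute. A sketch, where the generated interface class name mlir::tt::ttir::TTIROp is an assumption derived from OpInterface<"TTIROp">:

static unsigned queryAlignment(mlir::Operation *op) {
  // Falls back to 0 for ops that do not implement the TTIR interface.
  if (auto ttirOp = mlir::dyn_cast<mlir::tt::ttir::TTIROp>(op))
    return ttirOp.getSystemDesc().getAddressAlignBytes();
  return 0;
}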
2 changes: 1 addition & 1 deletion include/ttmlir/Target/Utils/MLIRToFlatbuffer.h
@@ -218,7 +218,7 @@ toFlatbuffer(FlatbufferObjectCache &cache, GridAttr tensorGrid,
SmallVector<std::int64_t> tensorGridShape(tensorGrid.getShape());
AffineMap mapping = deviceGrid.getMapping();
::ttmlir::utils::sample(
tensorGridShape, [&](SmallVector<std::int64_t> const &virtualCoreCoord) {
tensorGridShape, [&](ArrayRef<std::int64_t> virtualCoreCoord) {
SmallVector<std::int64_t> coreCoord = mapping.compose(virtualCoreCoord);
assert(coreCoord.size() == 3 && "expected a 2D core");
assert(coreCoord[0] == 0 && "expected single device");
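The signature change is deliberate: taking ArrayRef instead of SmallVector const & decouples the callback from the exact inline capacity of the vector sample() uses internally. A small sketch of the conversion:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// ArrayRef<std::int64_t> binds to any contiguous int64_t storage, so the
// same callback works regardless of which SmallVector<int64_t, N> (or even
// std::vector) the caller iterates with; no copies, no template coupling.
static void visit(llvm::ArrayRef<std::int64_t> coord) { (void)coord.size(); }

static void demo() {
  llvm::SmallVector<std::int64_t, 8> coord = {0, 1, 2};
  visit(coord); // implicit SmallVector -> ArrayRef conversion
}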
9 changes: 7 additions & 2 deletions include/ttmlir/Utils.h
@@ -7,18 +7,23 @@

#include <cstdint>

#include "llvm/ADT/SmallVector.h"

namespace ttmlir::utils {
template <typename T> T alignUp(T ptr, T alignment) {
return (ptr + alignment - 1) & ~(alignment - 1);
}

template <typename Vector, typename Fn>
inline void sample(Vector const &shape, Fn fn) {
Vector strides(shape.size());
llvm::SmallVector<std::int64_t, 8> strides(shape.size());
std::int64_t stride = 1;
for (std::int64_t i = shape.size() - 1; i >= 0; --i) {
strides[i] = stride;
stride *= shape[i];
}

Vector index(shape.size());
llvm::SmallVector<std::int64_t, 8> index(shape.size());
int64_t volume = stride;
for (int64_t i = 0; i < volume; ++i) {
for (unsigned j = 0; j < shape.size(); ++j) {
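alignUp relies on the alignment being a power of two, which holds for the NoC and PCIe alignments this change feeds in; a worked sketch with the precondition made explicit:

#include <cassert>
#include <cstdint>

// Adding (alignment - 1) carries ptr past the next boundary unless it is
// already aligned; masking with ~(alignment - 1) clears the low bits to
// land exactly on that boundary. Requires a power-of-two alignment.
static uint64_t alignUpChecked(uint64_t ptr, uint64_t alignment) {
  assert(alignment != 0 && (alignment & (alignment - 1)) == 0 &&
         "alignment must be a power of two");
  return (ptr + alignment - 1) & ~(alignment - 1);
}

int main() {
  assert(alignUpChecked(262145, 32) == 262176); // bumped to next 32B boundary
  assert(alignUpChecked(262144, 32) == 262144); // already aligned: unchanged
  return 0;
}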
38 changes: 38 additions & 0 deletions lib/Dialect/TT/IR/TTOpsTypes.cpp
@@ -8,13 +8,16 @@
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/DialectImplementation.h"
#include "ttmlir/Dialect/TT/IR/TT.h"
#include "ttmlir/Target/Common/system_desc_generated.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/TypeSwitch.h"

#include "ttmlir/Utils.h"

using namespace mlir::tt;

#include "ttmlir/Dialect/TT/IR/TTOpsEnums.cpp.inc"
@@ -126,6 +129,26 @@ mlir::tt::SystemDescAttr::getFromPath(MLIRContext *context, std::string &path) {
return system_desc_attr;
}

unsigned SystemDescAttr::getAddressAlignBytes(unsigned chipIndex) const {
// Take the most conservative (largest) alignment across the chip's engines.
return std::max(std::initializer_list<unsigned>{
getNocL1AddressAlignBytes(chipIndex),
getNocDRAMAddressAlignBytes(chipIndex),
getPcieAddressAlignBytes(chipIndex),
});
}

unsigned SystemDescAttr::getNocL1AddressAlignBytes(unsigned chipIndex) const {
return getChipDescs()[chipIndex].getNocL1AddressAlignBytes();
}

unsigned SystemDescAttr::getNocDRAMAddressAlignBytes(unsigned chipIndex) const {
return getChipDescs()[chipIndex].getNocDRAMAddressAlignBytes();
}

unsigned SystemDescAttr::getPcieAddressAlignBytes(unsigned chipIndex) const {
return getChipDescs()[chipIndex].getPcieAddressAlignBytes();
}

static mlir::MemRefType buildMemRef(::mlir::MLIRContext *context,
::llvm::ArrayRef<int64_t> shardShape,
::mlir::Type elementType,
@@ -479,6 +502,21 @@ uint64_t TileType::getSizeBytes() const {
}
}

SystemDescAttr mlir::tt::getCurrentScopeSystemDesc(mlir::Operation *op) {
// Walk up scope levels until we find the top-level ModuleOp, which carries
// the system desc.
while (op) {
if (mlir::isa<mlir::ModuleOp>(op)) {
auto systemDesc = op->getAttrOfType<SystemDescAttr>(SystemDescAttr::name);
assert(systemDesc && "expected system desc to be present on the module");
return systemDesc;
}
op = op->getParentOp();
}
assert(false && "expected system desc to be present in the scope");
return nullptr;
}

DeviceAttr mlir::tt::getCurrentScopeDevice(mlir::Operation *op) {
while (op) {
if (auto device = op->getAttrOfType<DeviceAttr>(DeviceAttr::name)) {
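A compact sketch of the intended consumption pattern, mirroring what the allocator pass below does: any op walks up to its module, reads the system desc, and derives one conservative alignment:

static uint64_t pickAllocatorAlignment(mlir::Operation *anyOpInFunc) {
  auto desc = mlir::tt::getCurrentScopeSystemDesc(anyOpInFunc);
  // Max over NoC L1, NoC DRAM, and PCIe alignment for chip 0.
  return desc.getAddressAlignBytes(/*chipIndex=*/0);
}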
3 changes: 3 additions & 0 deletions lib/Dialect/TTIR/IR/TTIROps.cpp
@@ -23,6 +23,9 @@ ::mlir::LogicalResult mlir::tt::ttir::ToLayoutOp::verify() {
if (not outputLayout) {
return emitOpError("Output tensor type missing layout attribute");
}
if (inputTy.getShape() != outputTy.getShape()) {
return emitOpError("Input and output shapes must be the same");
}
return success();
}

30 changes: 24 additions & 6 deletions lib/Dialect/TTIR/Transforms/Passes.cpp
@@ -21,6 +21,7 @@
#include "ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h"
#include "ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h"
#include "ttmlir/Dialect/TTIR/Transforms/Passes.h"
#include "ttmlir/Utils.h"

namespace mlir::tt::ttir {
#define GEN_PASS_DEF_TTIRGENERIC
@@ -581,14 +582,14 @@ inline uint64_t getTensorMemrefSizeBytes(RankedTensorType ty) {
class TTIRAllocate : public impl::TTIRAllocateBase<TTIRAllocate> {
struct SimpleAllocator {
static constexpr uint64_t kBaseAddress = 1llu << 18llu;
uint64_t addressAlignment;

SimpleAllocator(uint64_t addressAlignment)
: addressAlignment(addressAlignment) {}

SmallVector<uint64_t> currPtr = SmallVector<uint64_t>(
getMaxEnumValForMemorySpace() + 1llu, kBaseAddress);

uint64_t alignUp(uint64_t ptr, uint64_t alignment) {
return (ptr + alignment - 1) & ~(alignment - 1);
}

uint64_t allocate(uint64_t size, MemorySpace memorySpace) {
if (isSystemMemorySpace(memorySpace)) {
return 0;
@@ -597,7 +598,7 @@ class TTIRAllocate : public impl::TTIRAllocateBase<TTIRAllocate> {
uint32_t index = static_cast<uint32_t>(memorySpace);
assert(index < currPtr.size());
uint64_t &ptr = currPtr[index];
ptr = alignUp(ptr, 16);
ptr = ttmlir::utils::alignUp(ptr, addressAlignment);
auto result = ptr;
ptr += size;
return result;
@@ -638,10 +639,27 @@ class TTIRAllocate : public impl::TTIRAllocateBase<TTIRAllocate> {

module->walk([&](func::FuncOp func) {
assert(func.getBody().hasOneBlock());
SimpleAllocator allocator;
auto systemDesc = getCurrentScopeSystemDesc(func);
assert(systemDesc);
auto addressAlignment = systemDesc.getAddressAlignBytes();
SimpleAllocator allocator(addressAlignment);
Liveness liveness(func.getOperation());
const LivenessBlockInfo *livenessInfo =
liveness.getLiveness(&func.getBody().front());

mlir::SmallVector<Attribute> argumentAllocations;
for (auto operand : func.getArguments()) {
auto operandTy = mlir::cast<RankedTensorType>(operand.getType());
assert(operandTy.getEncoding());
auto memorySpace = getMemorySpace(operandTy);
auto sizeBytes = getTensorMemrefSizeBytes(operandTy);
auto address = allocator.allocate(sizeBytes, memorySpace);
argumentAllocations.push_back(rewriter.getAttr<ArgumentAllocationAttr>(
address, sizeBytes, memorySpace));
}
func->setDiscardableAttr(ArgumentAllocationAttr::name,
rewriter.getArrayAttr(argumentAllocations));

func->walk([&](tensor::EmptyOp empty) {
auto resultTy =
mlir::cast<RankedTensorType>(empty.getResult().getType());
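To make the argument pre-allocation concrete, here is a condensed model of SimpleAllocator's bump allocation for two 64x128 f32 inputs; the 32-byte alignment is an illustrative assumption, the real value comes from the system desc:

#include <cassert>
#include <cstdint>

constexpr uint64_t kBaseAddress = 1ull << 18; // 262144, as in SimpleAllocator

static uint64_t alignUp(uint64_t ptr, uint64_t a) {
  return (ptr + a - 1) & ~(a - 1);
}

int main() {
  uint64_t align = 32;          // illustrative addressAlignment
  uint64_t ptr = kBaseAddress;  // bump pointer for one memory space
  uint64_t size = 64 * 128 * 4; // 32768 bytes per f32 argument

  uint64_t arg0 = alignUp(ptr, align); ptr = arg0 + size; // arg0 = 262144
  uint64_t arg1 = alignUp(ptr, align); ptr = arg1 + size; // arg1 = 294912

  assert(arg0 == 262144 && arg1 == 294912);
  return 0;
}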
2 changes: 2 additions & 0 deletions lib/Dialect/TTMetal/Transforms/Passes.cpp
@@ -250,6 +250,8 @@ class ConvertTTIRToTTMetal
};

void createTTIRToTTMetalBackendPipeline(OpPassManager &pm) {
pm.addPass(mlir::tt::ttir::createTTIRLoadSystemDesc());
pm.addPass(mlir::tt::ttir::createTTIRImplicitDevice());
pm.addPass(mlir::tt::ttir::createTTIRGeneric());
pm.addPass(mlir::tt::ttir::createTTIRLayout());
pm.addPass(mlir::tt::ttir::createTTIRGenericRegionOperandsToMemref());
24 changes: 16 additions & 8 deletions lib/Dialect/TTMetal/Transforms/SerializeToBinary.cpp
@@ -103,9 +103,6 @@ class TTMetalSerializeToBinary
}

void runOnOperation() final {
constexpr uint64_t kHostAllocatedAddress = 0;
constexpr uint64_t kHostAllocatedSize = 0;

::flatbuffers::FlatBufferBuilder fbb;
FlatbufferObjectCache cache(&fbb);
CQBuilder cqBuilder(&fbb);
@@ -117,10 +114,22 @@
assert(entry && "expected an entry function");
cqBuilder.name = entry.getSymName().data();

auto argumentAllocations = mlir::cast<ArrayAttr>(
entry->getDiscardableAttr(ArgumentAllocationAttr::name));
assert(argumentAllocations && "expected tt.argument_allocations attribute");
for (auto &input : entry.getBody().getArguments()) {
auto argAlloc = mlir::cast<tt::ArgumentAllocationAttr>(
argumentAllocations[input.getArgNumber()]);
assert(
argAlloc.getMemorySpace() ==
mlir::cast<tt::LayoutAttr>(
mlir::cast<RankedTensorType>(input.getType()).getEncoding())
.getMemorySpace() &&
"argument allocation memory space does not match tensor type memory "
"space");
cqBuilder.inputs.push_back(
cache.getOrCreate(input, tensorValueToFlatbuffer,
kHostAllocatedAddress, kHostAllocatedSize));
argAlloc.getAddress(), argAlloc.getSize()));
}

module->walk([&](mlir::Operation *op) {
@@ -145,17 +154,16 @@
mlir::cast<ttkernel::ThreadTypeAttr>(
dispatchOp.getThreadTypes()[region.getRegionNumber()])
.getValue();
std::vector<::tt::target::Dim2dRange> core_range = {
toFlatbuffer(mlir::cast<CoreRangeAttr>(
dispatchOp.getCoreRanges()[region.getRegionNumber()])),
};
std::vector<::tt::target::Dim2dRange> coreRangeSet = {
toFlatbuffer(mlir::cast<CoreRangeAttr>(
dispatchOp.getCoreRanges()[region.getRegionNumber()]))};
std::vector<::flatbuffers::Offset<::tt::target::CBRef>> cbs;
kernels.push_back(::tt::target::metal::CreateKernelDescDirect(
fbb, ::tt::target::metal::Kernel::KernelSource,
::tt::target::metal::CreateKernelSourceDirect(
fbb, toFlatbuffer(threadType), source.c_str())
.Union(),
&core_range, &cbs, nullptr /*TODO debug info*/));
&coreRangeSet, &cbs, nullptr /*TODO debug info*/));
}
::flatbuffers::Offset<::tt::target::metal::ProgramDesc> program =
::tt::target::metal::CreateProgramDescDirect(fbb, &kernels);
2 changes: 1 addition & 1 deletion test/ttmlir/Dialect/TTIR/test_allocate.mlir
@@ -1,4 +1,4 @@
// RUN: ttmlir-opt --ttir-layout --ttir-allocate %s | FileCheck %s
// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-layout --ttir-allocate %s | FileCheck %s
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
2 changes: 1 addition & 1 deletion test/ttmlir/Dialect/TTMetal/simple_multiply.mlir
@@ -1,4 +1,4 @@
// RUN: ttmlir-opt --ttir-generic --ttir-layout --ttir-generic-region-operands-to-memref --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-generic --ttir-layout --ttir-generic-region-operands-to-memref --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
#any_device = #tt.operand_constraint<dram|l1|scalar|tile|any_device|any_device_tile>
module attributes {} {
func.func @forward(%arg0: tensor<64x128xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> {
2 changes: 1 addition & 1 deletion test/ttmlir/Dialect/TTMetal/to_layout.mlir
@@ -1,4 +1,4 @@
// RUN: ttmlir-opt --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
// RUN: ttmlir-opt --ttir-load-system-desc --ttir-implicit-device --ttir-allocate --convert-ttir-to-ttmetal %s | FileCheck %s
#l1_ = #tt.memory_space<l1>
#layout = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x1>, memref<64x128xf32, #l1_>>
#layout1 = #tt.layout<(d0, d1) -> (d0, d1), undef, <1x4>, memref<64x32xf32, #l1_>>