Skip to content

Commit

Permalink
Object FIFO: Introduce new dynamic lowering (Xilinx#1798)
Browse files Browse the repository at this point in the history
Co-authored-by: André Rösti <[email protected]>
Co-authored-by: AndraBisca <[email protected]>
Co-authored-by: Pranathi Vasireddy <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
5 people authored Oct 15, 2024
1 parent 9e95ec0 commit fa85462
Show file tree
Hide file tree
Showing 42 changed files with 2,385 additions and 11 deletions.
45 changes: 44 additions & 1 deletion docs/AIEDesignPatterns.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,9 @@ Operations can be performed on the objectFIFO in the cores: elements can be acqu
}
```

For correct execution, loops that contain objectFIFO operations must be unrolled based on objectFIFO size; the previous code in core12 becomes:
For correct execution, objectFIFO operations must be lowered so that new elements are accessed in each iteration of execution (based on the acquire / release patterns). Two different lowering techniques are described below:

In the default lowering, loops that contain objectFIFO operations are unrolled based on objectFIFO size; the previous code in core12 becomes:
```
%core12 = AIE.core(%tile12) {
%c0 = arith.constant 0 : index
Expand All @@ -416,6 +418,47 @@ For correct execution, loops that contain objectFIFO operations must be unrolled
}
```

Another lowering technique generates MLIR operations that track the acquire / release patterns at runtime and store their effects in a global state buffer. This global state buffer is then used to select the correct objectFIFO buffer through an `scf.index_switch` operation (scf::IndexSwitchOp); the previous code in core12 becomes:
```
%of0_buff_0 = aie.buffer(%tile_0_2) {sym_name = "of0_buff_0"} : memref<16xi32>
%of0_buff_1 = aie.buffer(%tile_0_2) {sym_name = "of0_buff_1"} : memref<16xi32>
%of0_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "of0_prod_lock"}
%of0_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "of0_cons_lock"}
%buffer_0_2 = aie.buffer(%tile_0_2) : memref<1xindex>
%core_0_2 = aie.core(%tile_0_2) {
%c0 = arith.constant 0 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
memref.store %c0, %buffer_0_2[%c0_0] : memref<1xindex>
%c0_1 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c12 = arith.constant 12 : index
scf.for %arg0 = %c0_1 to %c12 step %c1 {
aie.use_lock(%of0_prod_lock, AcquireGreaterEqual, 1)
%0 = memref.load %buffer_0_2[%c0_0] : memref<1xindex>
%1 = scf.index_switch %0 -> memref<16xi32>
case 0 {
scf.yield %of0_buff_0 : memref<16xi32>
}
case 1 {
scf.yield %of0_buff_1 : memref<16xi32>
}
default {
scf.yield %of0_buff_0 : memref<16xi32>
}
func.call @some_work(%1) : (memref<16xi32>) -> ()
aie.use_lock(%of0_cons_lock, Release, 1)
%2 = memref.load %buffer_0_2[%c0_0] : memref<1xindex>
%c1_2 = arith.constant 1 : index
%3 = arith.addi %2, %c1_2 : index
%4 = arith.remsi %3, %c2 : index
memref.store %4, %buffer_0_2[%c0_0] : memref<1xindex>
}
aie.end
}
```
This lowering can be enabled for an individual core by setting the `dynamic_objfifo_lowering` attribute of its CoreOp to true, or for all the cores in the design at once by passing the `--dynamic-objFifos` flag to `aiecc.py` (which forwards it as the `dynamic-objFifos` option of the `--aie-objectFifo-stateful-transform` lowering pass).

ObjectFIFOs can be established between tiles on the shim row and AIE tiles in order to bring data in from or out to external memory locations. These external memory locations are pointed to using AIE.external_buffer operations and they need to be explicitly registered to an objectFIFO so that it knows where the data has been allocated externally (in this case, the objectFIFO lowering will only allocate memory elements required by AIE tiles):
```
module @objectFIFO {
Expand Down
6 changes: 5 additions & 1 deletion include/aie/Dialect/AIE/IR/AIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ def AIE_CoreOp: AIE_Op<"core", [
ins Index:$tile,
DefaultValuedAttr<AIEI32Attr, "0x400">:$stack_size,
OptionalAttr<StrAttr>:$link_with,
OptionalAttr<StrAttr>:$elf_file
OptionalAttr<StrAttr>:$elf_file,
OptionalAttr<BoolAttr>:$dynamic_objfifo_lowering
);
let summary = "Declare a core module";
let description = [{
Expand All @@ -310,6 +311,9 @@ def AIE_CoreOp: AIE_Op<"core", [
are always stored in the local core memory, to avoid conflicts with static data allocations
in other cores.

This op has an optional `dynamic_objfifo_lowering` attribute, to finely control whether the
objectfifos in this core should be lowered using the dynamic runtime lowering.

Examples:
```
%tile = aie.tile(1, 1)
Expand Down
5 changes: 5 additions & 0 deletions include/aie/Dialect/AIE/Transforms/AIEPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,11 @@ def AIEObjectFifoStatefulTransform : Pass<"aie-objectFifo-stateful-transform", "
"mlir::memref::MemRefDialect",
"xilinx::AIE::AIEDialect",
];

let options = [
Option<"clDynamicObjectFifos", "dynamic-objFifos", "bool", /*default=*/"false",
"Flag to enable dynamic object fifo lowering in cores instead of loop unrolling.">
];
}

def AIEObjectFifoRegisterProcess : Pass<"aie-register-objectFifos", "DeviceOp"> {
Expand Down
183 changes: 179 additions & 4 deletions lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
Expand Down Expand Up @@ -852,6 +853,7 @@ struct AIEObjectFifoStatefulTransformPass
lcm = i * lcm / std::gcd(i, lcm);
return lcm;
}

// Function that unrolls for-loops that contain objectFifo operations.
LogicalResult unrollForLoops(DeviceOp &device, OpBuilder &builder,
std::set<TileOp> objectFifoTiles) {
Expand Down Expand Up @@ -967,6 +969,156 @@ struct AIEObjectFifoStatefulTransformPass
return success();
}

// Emits, immediately after `relOp`, the IR that advances the runtime counter
// of the released objectfifo:
//   counter = (counter + <number of released elements>) rem size
// `globalNextIndex` is the state buffer holding the counters, `index` is the
// constant selecting this objectfifo's slot in that buffer and `size` is the
// constant used as the modulus. Called by dynamicGlobalObjectFifos().
void updateGlobalNextIndex(OpBuilder &builder, ObjectFifoReleaseOp relOp,
                           BufferOp globalNextIndex, arith::ConstantOp index,
                           arith::ConstantOp size) {
  auto loc = builder.getUnknownLoc();
  Value slot = index.getResult();

  builder.setInsertionPointAfter(relOp);

  // Read the current counter value from the state buffer.
  Value current =
      builder.create<memref::LoadOp>(loc, globalNextIndex, ValueRange(slot));

  // Add the number of elements released by this op and wrap around.
  Value released = builder.create<arith::ConstantOp>(
      loc, builder.getIndexAttr(relOp.getSize()));
  Value advanced = builder.create<arith::AddIOp>(loc, current, released);
  Value wrapped = builder.create<arith::RemSIOp>(loc, advanced, size);

  // Write the updated counter back into the same slot.
  builder.create<memref::StoreOp>(size.getLoc(), wrapped, globalNextIndex,
                                  ValueRange(slot));
}

// Function that generates the IR for objectfifo accesses to be handled at
// runtime.
//
// For every core whose tile is contained in `objectFifoTiles`:
//  - a state buffer with one `index` counter per (objectfifo, port) pair
//    acquired in the core is created just before the core, and every counter
//    is initialized to 0 at the start of the core body;
//  - after each ObjectFifoReleaseOp the matching counter is advanced (see
//    updateGlobalNextIndex());
//  - the result of each ObjectFifoSubviewAccessOp that uses an
//    ObjectFifoAcquireOp is replaced by the result of an scf::IndexSwitchOp
//    that selects the buffer based on the current counter value.
//
// A release on an (objectfifo, port) pair that is never acquired in the same
// core has no counter (nothing in the core would read it), so no counter
// update is generated for it. Previously this case looked the pair up with
// operator[], which default-constructed a null ConstantOp that was then
// dereferenced.
//
// Returns failure() only if the walk over a core is interrupted.
LogicalResult dynamicGlobalObjectFifos(DeviceOp &device, OpBuilder &builder,
                                       std::set<TileOp> objectFifoTiles) {
  using FifoPortKey = std::pair<ObjectFifoCreateOp, ObjectFifoPort>;

  for (auto coreOp : device.getOps<CoreOp>()) {
    if (objectFifoTiles.count(coreOp.getTileOp()) == 0)
      continue;

    // For each core: count the number of objectFifos and create
    // a global buffer just before the core to track index of
    // next object to access.
    // !! NOTE !! objectFifos with same producer / consumer tile
    // need two counters (accessed based on the ObjectFifoPort)
    std::map<FifoPortKey, int> fifoSizes;
    coreOp.walk([&](ObjectFifoAcquireOp acqOp) {
      ObjectFifoCreateOp op = acqOp.getObjectFifo();
      ObjectFifoPort port = acqOp.getPort();
      if (fifoSizes.find({op, port}) == fifoSizes.end())
        fifoSizes[{op, port}] = op.size();
    });
    builder.setInsertionPoint(coreOp);
    auto memrefTy =
        MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
                        builder.getIndexType());
    auto globalNextIndex = builder.create<BufferOp>(
        builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
        /*sym_name*/ nullptr, /*address*/ nullptr,
        /*initial_value*/ nullptr, /*mem_bank*/ nullptr);

    // Initialize all counters in the global buffers to 0.
    // Also, keep a map of the ConstantOps for the indices per OF
    // and a map with the ConstantOps for the sizes per OF.
    std::map<FifoPortKey, arith::ConstantOp> globalIndices;
    std::map<FifoPortKey, arith::ConstantOp> constantSizes;
    int index = 0;
    builder.setInsertionPointToStart(&(coreOp.getBody().front()));
    Value initVal = builder.create<arith::ConstantOp>(
        builder.getUnknownLoc(), builder.getIndexAttr(0));
    for (const auto &entry : fifoSizes) {
      auto indexOp = builder.create<arith::ConstantOp>(
          initVal.getLoc(), builder.getIndexAttr(index));
      globalIndices[entry.first] = indexOp;
      index++;
      auto size = builder.create<arith::ConstantOp>(
          indexOp.getLoc(), builder.getIndexAttr(entry.second));
      constantSizes[entry.first] = size;
      builder.create<memref::StoreOp>(
          size.getLoc(), initVal, globalNextIndex,
          ValueRange(ArrayRef({indexOp.getResult()})));
    }

    // Walk the code:
    // - after each ObjectFifoReleaseOp:
    //   - globalNextIndex: add #rel modulo objfifo depth
    // - before each ObjectFifoAcquireOp:
    //   - globalNextIndex: load index and use it to index_switch (one
    //     IndexSwitchOp per AccessOp)
    WalkResult res = coreOp.walk([&](Operation *op) {
      if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
        FifoPortKey key(relOp.getObjectFifo(), relOp.getPort());
        // Use find() rather than operator[]: a pair that is released but
        // never acquired in this core has no counter, and operator[] would
        // hand a null ConstantOp to updateGlobalNextIndex().
        auto indexIt = globalIndices.find(key);
        auto sizeIt = constantSizes.find(key);
        if (indexIt != globalIndices.end() && sizeIt != constantSizes.end())
          updateGlobalNextIndex(builder, relOp, globalNextIndex,
                                indexIt->second, sizeIt->second);
        return WalkResult::advance();
      }

      auto acqOp = dyn_cast<ObjectFifoAcquireOp>(op);
      if (!acqOp)
        return WalkResult::advance();

      // Every acquired (objectfifo, port) pair was registered above, so
      // these lookups always hit an existing entry.
      ObjectFifoCreateOp createOp = acqOp.getObjectFifo();
      ObjectFifoPort port = acqOp.getPort();
      int depth = fifoSizes[{createOp, port}];

      // Single switch case: with a depth of 1 there is nothing to select.
      if (depth == 1)
        return WalkResult::advance();

      // Collect the users first: the loop below rewrites their uses.
      std::vector<ObjectFifoSubviewAccessOp> accessOps;
      for (auto u : acqOp->getUsers())
        if (auto accessOp = dyn_cast<ObjectFifoSubviewAccessOp>(u))
          accessOps.push_back(accessOp);

      for (auto accessOp : accessOps) {
        // Create a switch for each subview access
        builder.setInsertionPointAfter(accessOp);
        auto switchIndex = builder.create<memref::LoadOp>(
            builder.getUnknownLoc(), globalNextIndex,
            ValueRange(
                ArrayRef({globalIndices[{createOp, port}].getResult()})));
        unsigned caseRegionCounts = depth;
        SmallVector<int64_t, 4> caseValues;
        for (int i = 0; i < depth; ++i)
          caseValues.push_back(i);
        auto cases = DenseI64ArrayAttr::get(builder.getContext(), caseValues);
        auto switchOp = builder.create<scf::IndexSwitchOp>(
            switchIndex.getLoc(),
            TypeRange({buffersPerFifo[createOp][0].getType()}), switchIndex,
            cases, caseRegionCounts);

        // Create default case of IndexSwitchOp
        builder.createBlock(&switchOp.getDefaultRegion());
        auto bufferIndex = (accessOp.getIndex()) % createOp.size();
        builder.setInsertionPointToStart(&(switchOp.getDefaultBlock()));
        builder.create<scf::YieldOp>(
            builder.getUnknownLoc(),
            buffersPerFifo[createOp][bufferIndex].getResult());

        // Create other cases of IndexSwitchOp: case i yields the buffer
        // located `accessOp.getIndex()` positions after counter value i.
        for (int i = 0; i < depth; ++i) {
          builder.createBlock(&switchOp.getCaseRegions()[i]);
          builder.setInsertionPoint(&switchOp.getCaseBlock(i),
                                    switchOp.getCaseBlock(i).begin());
          int bufferToBeAccessed = (accessOp.getIndex() + i) % depth;
          builder.create<scf::YieldOp>(
              switchOp.getCaseRegions()[i].getLoc(),
              buffersPerFifo[createOp][bufferToBeAccessed].getResult());
        }

        // Replace all uses of accessed objectfifo buffers with
        // results of switchOps
        accessOp.getOutput().replaceAllUsesWith(switchOp.getResult(0));
      }
      return WalkResult::advance();
    });
    // No callback path interrupts the walk today; this check is kept so
    // that a future error path is propagated to the caller.
    if (res.wasInterrupted())
      return failure();
  }
  return success();
}

/// Function used to create a UseLockOp based on input parameters.
/// acc is an accumulator map that tracks the indices of the next locks to
/// acquire (or release). Uses op to find index of acc for next lockID.
Expand Down Expand Up @@ -1240,7 +1392,9 @@ struct AIEObjectFifoStatefulTransformPass
// - Create objectFifo buffers and locks.
// - Populate a list of tiles containing objectFifos for later processing of
// the acquires/releases (uses of the FIFO).
// - Global release counter tracker to keep track of the objectFifo state
//===------------------------------------------------------------------===//

for (auto createOp : device.getOps<ObjectFifoCreateOp>()) {
int share_direction = 0;
bool shared = !requiresDMAs(createOp, share_direction);
Expand Down Expand Up @@ -1343,10 +1497,31 @@ struct AIEObjectFifoStatefulTransformPass
}

//===------------------------------------------------------------------===//
// Unroll for loops
// Statically unroll for loops or use dynamic objectFifos
//===------------------------------------------------------------------===//
if (failed(unrollForLoops(device, builder, objectFifoTiles))) {
signalPassFailure();
if (clDynamicObjectFifos) {
if (failed(dynamicGlobalObjectFifos(device, builder, objectFifoTiles)))
signalPassFailure();
} else {
std::set<TileOp> dynamicTiles;
std::set<TileOp> unrollTiles;
for (auto c : device.getOps<CoreOp>()) {
TileOp t = c.getTileOp();
if (objectFifoTiles.count(t) > 0) {
if (c.getDynamicObjfifoLowering().has_value()) {
if (c.getDynamicObjfifoLowering().value())
dynamicTiles.insert(t);
else
unrollTiles.insert(t);
} else {
unrollTiles.insert(t);
}
}
}
if (failed(dynamicGlobalObjectFifos(device, builder, dynamicTiles)))
signalPassFailure();
if (failed(unrollForLoops(device, builder, unrollTiles)))
signalPassFailure();
}

//===------------------------------------------------------------------===//
Expand Down Expand Up @@ -1559,4 +1734,4 @@ struct AIEObjectFifoStatefulTransformPass
// Factory function for the objectFifo stateful transform pass: returns a
// newly allocated, uniquely owned AIEObjectFifoStatefulTransformPass.
std::unique_ptr<OperationPass<DeviceOp>>
AIE::createAIEObjectFifoStatefulTransformPass() {
return std::make_unique<AIEObjectFifoStatefulTransformPass>();
}
}
2 changes: 2 additions & 0 deletions mlir_tutorials/tutorial-3/objectFifo_ver/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ aie-opt --aie-canonicalize-device <path to mlir source file> | aie-opt --aie-obj
```
We note that in the above command there are actually two lowering passes being applied. The first pass will ensure that there exists a target device configuration in the source code, or add one if there isn't. That is the same pass that is used by `aiecc.py`, but needs to be explicitly called when running lowering passes separately. Further details on the device configuration can be found in [tutorial-2b](../../tutorial-2/tutorial-2b/).

Two different lowerings currently exist for objectFifo operations: one is a static lowering that keeps track of acquire / release operations at compile-time and unrolls for-loops to ensure the proper buffer / lock pair is accessed each iteration; the other is a runtime solution which keeps track of acquire / release operations in a global state buffer which is then read to determine the correct buffer to access each iteration through an scf.IndexSwitchOp. Additional details can be found in the [Design Patterns](../../../docs/AIEDesignPatterns.md).

## <ins>Tutorial 3 Lab </ins>

1. Read through the [/objectFifo_ver/aie.mlir](aie.mlir) design. In which tile and its local memory will the objectFifo lowering generate the buffer and its lock? <img src="../../images/answer1.jpg" title="On even rows tiles have local memories to their left, so the shared memory is that of tile (2,4). That is where the lowering will generate the shared buffer and lock." height=25>
Expand Down
66 changes: 66 additions & 0 deletions programming_examples/dynamic_object_fifo/nested_loops/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
#
##===----------------------------------------------------------------------===##

# ---

# The following environment variables that point to the Xilinx runtime (XRT)
# should be set up by an environment setup script already.
# Both use `?=`, so values already present in the environment take precedence.
# XILINX_VITIS defaults to the parent of the directory containing the `vitis`
# executable found on PATH.
XILINX_XRT?=/opt/xilinx/xrt
XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)

# ---

# Directory containing this Makefile; sources are referenced relative to it.
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
XILINX_XRT_LIB?=${XILINX_XRT}/lib

# Flags for the kernel compiler wrapper and for building the host program
# against XRT.
CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include
XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
XRT_LIBS=-lxrt_coreutil
CXX=g++-13 -ggdb

# Output locations; each can be overridden on the command line or from the
# environment.
#mlir_target?=build/aie.mlir
xclbin_target?=build/final.xclbin
insts_target?=build/insts.txt
host_target?=build/test

# Default target: build the xclbin and the host executable.
.PHONY: all
all: ${xclbin_target} ${host_target}

# Generate the MLIR design by running aie2.py and capturing its stdout.
build/aie.mlir: ${srcdir}/aie2.py
mkdir -p ${@D}
python3 $< > $@

# Compile the kernel with xchesscc_wrapper from inside the build directory.
build/kernel.o: ${srcdir}/kernel.cc
mkdir -p ${@D}
cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}

# Build the xclbin and the NPU instruction file with aiecc.py, with
# --dynamic-objFifos passed to aiecc.py.
# The recipe runs inside the build directory, hence:
#  - ${insts_target:build/%=%} strips the leading "build/" from the
#    instruction file name;
#  - ${<:%=../%} prefixes the first prerequisite (build/aie.mlir) with "../".
${xclbin_target}: build/aie.mlir build/kernel.o
mkdir -p ${@D}
cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}

# Build the host test program from test.cpp.
# NOTE(review): M and N are not assigned anywhere in this Makefile; unless
# they are supplied on the command line or via the environment, -DM= and -DN=
# define empty macros - confirm whether test.cpp relies on them.
${host_target}: ${srcdir}/test.cpp ${xclbin_target}
mkdir -p ${@D}
${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}

# Run the host test program (no command-line arguments are passed).
.PHONY: run
run: ${host_target}
./${host_target}

# Run XRT's setup_xclbin_firmware.sh on the xclbin for the "Phoenix" device.
xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh
.PHONY: sign
sign: ${xclbin_target}
${xclbin_sign} -dev Phoenix -xclbin $<

# Remove the build directory; the leading "-" makes make ignore a failure of
# rm (e.g. when the directory does not exist).
.PHONY: clean
clean:
-rm -r build
Loading

0 comments on commit fa85462

Please sign in to comment.