From fa8546289e2e6c015867b379da9af33fe4be220b Mon Sep 17 00:00:00 2001 From: Andra Bisca Date: Tue, 15 Oct 2024 17:14:05 +0200 Subject: [PATCH] Object FIFO: Introduce new dynamic lowering (#1798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Rösti Co-authored-by: AndraBisca Co-authored-by: Pranathi Vasireddy Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/AIEDesignPatterns.md | 45 +++- include/aie/Dialect/AIE/IR/AIEOps.td | 6 +- .../aie/Dialect/AIE/Transforms/AIEPasses.td | 5 + .../AIEObjectFifoStatefulTransform.cpp | 183 ++++++++++++- .../tutorial-3/objectFifo_ver/README.md | 2 + .../dynamic_object_fifo/nested_loops/Makefile | 66 +++++ .../nested_loops/README.md | 40 +++ .../dynamic_object_fifo/nested_loops/aie2.py | 67 +++++ .../nested_loops/kernel.cc | 22 ++ .../nested_loops/run_makefile.lit | 9 + .../dynamic_object_fifo/nested_loops/test.cpp | 126 +++++++++ .../dynamic_object_fifo/ping_pong/.gitignore | 1 + .../dynamic_object_fifo/ping_pong/Makefile | 66 +++++ .../dynamic_object_fifo/ping_pong/README.md | 40 +++ .../dynamic_object_fifo/ping_pong/aie2.py | 64 +++++ .../dynamic_object_fifo/ping_pong/kernel.cc | 22 ++ .../ping_pong/run_makefile.lit | 9 + .../dynamic_object_fifo/ping_pong/test.cpp | 103 ++++++++ .../dynamic_object_fifo/reduction/Makefile | 66 +++++ .../dynamic_object_fifo/reduction/README.md | 40 +++ .../dynamic_object_fifo/reduction/aie2.py | 67 +++++ .../dynamic_object_fifo/reduction/kernel.cc | 24 ++ .../reduction/run_makefile.lit | 9 + .../dynamic_object_fifo/reduction/test.cpp | 123 +++++++++ .../sliding_window/Makefile | 66 +++++ .../sliding_window/README.md | 44 ++++ .../sliding_window/aie2.py | 76 ++++++ .../sliding_window/aie2_if_else.py | 74 ++++++ .../sliding_window/kernel.cc | 24 ++ .../sliding_window/run_makefile.lit | 9 + .../sliding_window/test.cpp | 125 +++++++++ .../two_core_sliding_window/Makefile | 66 +++++ 
.../two_core_sliding_window/README.md | 40 +++ .../two_core_sliding_window/aie2.py | 90 +++++++ .../two_core_sliding_window/kernel.cc | 38 +++ .../two_core_sliding_window/run_makefile.lit | 9 + .../two_core_sliding_window/test.cpp | 125 +++++++++ python/compiler/aiecc/cl_arguments.py | 7 + python/compiler/aiecc/main.py | 8 +- python/dialects/aie.py | 9 +- .../dynamic_lowering_flag_test.mlir | 137 ++++++++++ .../dynamic_lowering_test.mlir | 244 ++++++++++++++++++ 42 files changed, 2385 insertions(+), 11 deletions(-) create mode 100644 programming_examples/dynamic_object_fifo/nested_loops/Makefile create mode 100644 programming_examples/dynamic_object_fifo/nested_loops/README.md create mode 100644 programming_examples/dynamic_object_fifo/nested_loops/aie2.py create mode 100644 programming_examples/dynamic_object_fifo/nested_loops/kernel.cc create mode 100644 programming_examples/dynamic_object_fifo/nested_loops/run_makefile.lit create mode 100644 programming_examples/dynamic_object_fifo/nested_loops/test.cpp create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/.gitignore create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/Makefile create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/README.md create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/aie2.py create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/kernel.cc create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/run_makefile.lit create mode 100644 programming_examples/dynamic_object_fifo/ping_pong/test.cpp create mode 100644 programming_examples/dynamic_object_fifo/reduction/Makefile create mode 100644 programming_examples/dynamic_object_fifo/reduction/README.md create mode 100644 programming_examples/dynamic_object_fifo/reduction/aie2.py create mode 100644 programming_examples/dynamic_object_fifo/reduction/kernel.cc create mode 100644 programming_examples/dynamic_object_fifo/reduction/run_makefile.lit create 
mode 100644 programming_examples/dynamic_object_fifo/reduction/test.cpp create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/Makefile create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/README.md create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/aie2.py create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/aie2_if_else.py create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/kernel.cc create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/run_makefile.lit create mode 100644 programming_examples/dynamic_object_fifo/sliding_window/test.cpp create mode 100644 programming_examples/dynamic_object_fifo/two_core_sliding_window/Makefile create mode 100644 programming_examples/dynamic_object_fifo/two_core_sliding_window/README.md create mode 100644 programming_examples/dynamic_object_fifo/two_core_sliding_window/aie2.py create mode 100644 programming_examples/dynamic_object_fifo/two_core_sliding_window/kernel.cc create mode 100644 programming_examples/dynamic_object_fifo/two_core_sliding_window/run_makefile.lit create mode 100644 programming_examples/dynamic_object_fifo/two_core_sliding_window/test.cpp create mode 100644 test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir create mode 100644 test/objectFifo-stateful-transform/dynamic_lowering_test.mlir diff --git a/docs/AIEDesignPatterns.md b/docs/AIEDesignPatterns.md index 6838440a29..b72d2fca03 100644 --- a/docs/AIEDesignPatterns.md +++ b/docs/AIEDesignPatterns.md @@ -393,7 +393,9 @@ Operations can be performed on the objectFIFO in the cores: elements can be acqu } ``` -For correct execution, loops that contain objectFIFO operations must be unrolled based on objectFIFO size; the previous code in core12 becomes: +For correct execution, objectfifo operations must be lowered such that each iteration of execution, new elements are accessed (based on acquire / release patterns). 
Two different lowering techniques are described below: + +In the default lowering, loops that contain objectFIFO operations are unrolled based on objectFIFO size; the previous code in core12 becomes: ``` %core12 = AIE.core(%tile12) { %c0 = arith.constant 0 : index @@ -416,6 +418,47 @@ For correct execution, loops that contain objectFIFO operations must be unrolled } ``` +Another lowering technique generates MLIR operations that ensure the acquire / release patterns are taken into account at runtime and their effects are stored in a global buffer. This global state buffer is then used to correctly access objectfifos using SCF.IndexSwitchOps; the previous code in core12 becomes: +``` +%of0_buff_0 = aie.buffer(%tile_0_2) {sym_name = "of0_buff_0"} : memref<16xi32> +%of0_buff_1 = aie.buffer(%tile_0_2) {sym_name = "of0_buff_1"} : memref<16xi32> +%of0_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "of0_prod_lock"} +%of0_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "of0_cons_lock"} +%buffer_0_2 = aie.buffer(%tile_0_2) : memref<1xindex> +%core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c0_0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + memref.store %c0, %buffer_0_2[%c0_0] : memref<1xindex> + %c0_1 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12 = arith.constant 12 : index + scf.for %arg0 = %c0_1 to %c12 step %c1 { + aie.use_lock(%of0_prod_lock, AcquireGreaterEqual, 1) + %0 = memref.load %buffer_0_2[%c0_0] : memref<1xindex> + %1 = scf.index_switch %0 -> memref<16xi32> + case 0 { + scf.yield %of0_buff_0 : memref<16xi32> + } + case 1 { + scf.yield %of0_buff_1 : memref<16xi32> + } + default { + scf.yield %of0_buff_0 : memref<16xi32> + } + func.call @some_work(%1) : (memref<16xi32>) -> () + aie.use_lock(%of0_cons_lock, Release, 1) + %2 = memref.load %buffer_0_2[%c0_0] : memref<1xindex> + %c1_2 = arith.constant 1 : index + %3 = arith.addi %2, %c1_2 : index + %4 = arith.remsi %3, %c2 : 
index + memref.store %4, %buffer_0_2[%c0_0] : memref<1xindex> + } + aie.end +} +``` +This lowering can be enabled for each core by setting the `dynamic_objfifo_lowering` attribute of the CoreOp to true, or enabled for all the cores in the design at once by setting the `dynamic-objFifos` flag of aiecc (which is then passed to the --aie-objectFifo-stateful-transform lowering pass). + ObjectFIFOs can be established between tiles on the shim row and AIE tiles in order to bring data in from or out to external memory locations. These external memory locations are pointed to using AIE.external_buffer operations and they need to be explicitly registered to an objectFIFO so that it knows where the data has been allocated externally (in this case, the objectFIFO lowering will only allocate memory elements required by AIE tiles): ``` module @objectFIFO { diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 629bd99cfc..bcff1fd713 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -295,7 +295,8 @@ def AIE_CoreOp: AIE_Op<"core", [ ins Index:$tile, DefaultValuedAttr:$stack_size, OptionalAttr:$link_with, - OptionalAttr:$elf_file + OptionalAttr:$elf_file, + OptionalAttr:$dynamic_objfifo_lowering ); let summary = "Declare a core module"; let description = [{ @@ -310,6 +311,9 @@ def AIE_CoreOp: AIE_Op<"core", [ are always stored in the local core memory, to avoid conflicts with static data allocations in other cores. + This op has an optional `dynamic_objfifo_lowering` attribute, to finely control whether the + objectfifos in this core should be lowered using the dynamic runtime lowering. 
+ Examples: ``` %tile = aie.tile(1, 1) diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.td b/include/aie/Dialect/AIE/Transforms/AIEPasses.td index 525ace0df0..8453c4cd0c 100644 --- a/include/aie/Dialect/AIE/Transforms/AIEPasses.td +++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.td @@ -200,6 +200,11 @@ def AIEObjectFifoStatefulTransform : Pass<"aie-objectFifo-stateful-transform", " "mlir::memref::MemRefDialect", "xilinx::AIE::AIEDialect", ]; + + let options = [ + Option<"clDynamicObjectFifos", "dynamic-objFifos", "bool", /*default=*/"false", + "Flag to enable dynamic object fifo lowering in cores instead of loop unrolling."> + ]; } def AIEObjectFifoRegisterProcess : Pass<"aie-register-objectFifos", "DeviceOp"> { diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index fdabf47423..4e40a0b501 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -15,6 +15,7 @@ #include "mlir/Analysis/TopologicalSortUtils.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Utils/Utils.h" @@ -852,6 +853,7 @@ struct AIEObjectFifoStatefulTransformPass lcm = i * lcm / std::gcd(i, lcm); return lcm; } + // Function that unrolls for-loops that contain objectFifo operations. LogicalResult unrollForLoops(DeviceOp &device, OpBuilder &builder, std::set objectFifoTiles) { @@ -967,6 +969,156 @@ struct AIEObjectFifoStatefulTransformPass return success(); } + // Function that generates the IR to update runtime state of objectfifo + // accesses. Called by dynamicGlobalObjectFifos(). 
+ void updateGlobalNextIndex(OpBuilder &builder, ObjectFifoReleaseOp relOp, + BufferOp globalNextIndex, arith::ConstantOp index, + arith::ConstantOp size) { + builder.setInsertionPointAfter(relOp); + Value oldCounter = builder.create( + builder.getUnknownLoc(), globalNextIndex, + ValueRange(ArrayRef({index.getResult()}))); + Value val = builder.create( + oldCounter.getLoc(), builder.getIndexAttr(relOp.getSize())); + Value sum = builder.create(val.getLoc(), oldCounter, val); + Value newCounter = builder.create(sum.getLoc(), sum, size); + builder.create(size.getLoc(), newCounter, globalNextIndex, + ValueRange(ArrayRef({index.getResult()}))); + } + + // Function that generates the IR for objectfifo accesses to be handled at + // runtime. + LogicalResult dynamicGlobalObjectFifos(DeviceOp &device, OpBuilder &builder, + std::set objectFifoTiles) { + for (auto coreOp : device.getOps()) { + if (objectFifoTiles.count(coreOp.getTileOp()) <= 0) + continue; + if (objectFifoTiles.count(coreOp.getTileOp()) > 0) { + // For each core: count the number of objectFifos and create + // a global buffer just before the core to track index of + // next object to access. + // !! NOTE !! objectFifos with same producer / consumer tile + // need two counters (accessed based on the ObjectFifoPort) + std::map, int> fifoSizes; + coreOp.walk([&](ObjectFifoAcquireOp acqOp) { + ObjectFifoCreateOp op = acqOp.getObjectFifo(); + ObjectFifoPort port = acqOp.getPort(); + if (fifoSizes.find({op, port}) == fifoSizes.end()) + fifoSizes[{op, port}] = op.size(); + }); + builder.setInsertionPoint(coreOp); + auto memrefTy = + MemRefType::get(SmallVector{(int64_t)fifoSizes.size()}, + builder.getIndexType()); + auto globalNextIndex = builder.create( + builder.getUnknownLoc(), memrefTy, coreOp.getTile(), + /*sym_name*/ nullptr, /*address*/ nullptr, + /*initial_value*/ nullptr, /*mem_bank*/ nullptr); + + // Initialize all counters in the global buffers to 0. 
+ // Also, keep a map of the ConstantOps for the indices per OF + // and a map with the ConstantOps for the sizes per OF. + std::map, + arith::ConstantOp> + globalIndices; + std::map, + arith::ConstantOp> + constantSizes; + int index = 0; + builder.setInsertionPointToStart(&(coreOp.getBody().front())); + Value initVal = builder.create( + builder.getUnknownLoc(), builder.getIndexAttr(0)); + for (auto i : fifoSizes) { + auto indexOp = builder.create( + initVal.getLoc(), builder.getIndexAttr(index)); + globalIndices[i.first] = indexOp; + index++; + auto size = builder.create( + indexOp.getLoc(), builder.getIndexAttr(i.second)); + constantSizes[i.first] = size; + builder.create( + size.getLoc(), initVal, globalNextIndex, + ValueRange(ArrayRef({indexOp.getResult()}))); + } + + // Walk the code: + // - after each ObjectFifoReleaseOp: + // - globalNextIndex: add #rel modulo objfifo depth + // - for each ObjectFifoAcquireOp: + // - globalNextIndex: load index and use it to index_switch (one + // IndexSwitchOp per AccessOp, inserted after the AccessOp) + WalkResult res = coreOp.walk([&](Operation *op) { + if (auto relOp = dyn_cast(op)) { + ObjectFifoCreateOp createOp = relOp.getObjectFifo(); + ObjectFifoPort port = relOp.getPort(); + updateGlobalNextIndex(builder, relOp, globalNextIndex, + globalIndices[{createOp, port}], + constantSizes[{createOp, port}]); + } + if (auto acqOp = dyn_cast(op)) { + std::vector accessOps; + for (auto u : acqOp->getUsers()) + if (auto accessOp = dyn_cast(u)) + accessOps.push_back(accessOp); + + for (auto accessOp : accessOps) { + ObjectFifoCreateOp createOp = acqOp.getObjectFifo(); + ObjectFifoPort port = acqOp.getPort(); + + // Depth-1 objectFifo: single buffer, no IndexSwitchOp is created + if (fifoSizes[{createOp, port}] == 1) + return WalkResult::advance(); + + // Create a switch for each subview access + builder.setInsertionPointAfter(accessOp); + auto switchIndex = builder.create( + builder.getUnknownLoc(), globalNextIndex, + ValueRange( + ArrayRef({globalIndices[{createOp, port}].getResult()}))); + 
unsigned caseRegionCounts = fifoSizes[{createOp, port}]; + SmallVector caseValues; + for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) { + caseValues.push_back(i); + } + auto cases = + DenseI64ArrayAttr::get(builder.getContext(), caseValues); + auto switchOp = builder.create( + switchIndex.getLoc(), + TypeRange({buffersPerFifo[createOp][0].getType()}), + switchIndex, cases, caseRegionCounts); + // Create default case of IndexSwitchOp + builder.createBlock(&switchOp.getDefaultRegion()); + auto bufferIndex = (accessOp.getIndex()) % createOp.size(); + builder.setInsertionPointToStart(&(switchOp.getDefaultBlock())); + builder.create( + builder.getUnknownLoc(), + buffersPerFifo[createOp][bufferIndex].getResult()); + for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) { + // Create other cases of IndexSwitchOp + builder.createBlock(&switchOp.getCaseRegions()[i]); + builder.setInsertionPoint(&switchOp.getCaseBlock(i), + switchOp.getCaseBlock(i).begin()); + int bufferToBeAccesed = + (accessOp.getIndex() + i) % fifoSizes[{createOp, port}]; + builder.create( + switchOp.getCaseRegions()[i].getLoc(), + buffersPerFifo[createOp][bufferToBeAccesed].getResult()); + } + + // Replace all uses of accessed objectfifo buffers with + // results of switchOps + accessOp.getOutput().replaceAllUsesWith(switchOp.getResult(0)); + } + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) + return failure(); + } + } + return success(); + } + /// Function used to create a UseLockOp based on input parameters. /// acc is an accumulator map that tracks the indices of the next locks to /// acquire (or release). Uses op to find index of acc for next lockID. @@ -1240,7 +1392,9 @@ struct AIEObjectFifoStatefulTransformPass // - Create objectFifo buffers and locks. // - Populate a list of tiles containing objectFifos for later processing of // the acquires/releases (uses of the FIFO). 
+ // - Global release counter tracker to keep track of the objectFifo state //===------------------------------------------------------------------===// + for (auto createOp : device.getOps()) { int share_direction = 0; bool shared = !requiresDMAs(createOp, share_direction); @@ -1343,10 +1497,31 @@ struct AIEObjectFifoStatefulTransformPass } //===------------------------------------------------------------------===// - // Unroll for loops + // Statically unroll for loops or use dynamic objectFifos //===------------------------------------------------------------------===// - if (failed(unrollForLoops(device, builder, objectFifoTiles))) { - signalPassFailure(); + if (clDynamicObjectFifos) { + if (failed(dynamicGlobalObjectFifos(device, builder, objectFifoTiles))) + signalPassFailure(); + } else { + std::set dynamicTiles; + std::set unrollTiles; + for (auto c : device.getOps()) { + TileOp t = c.getTileOp(); + if (objectFifoTiles.count(t) > 0) { + if (c.getDynamicObjfifoLowering().has_value()) { + if (c.getDynamicObjfifoLowering().value()) + dynamicTiles.insert(t); + else + unrollTiles.insert(t); + } else { + unrollTiles.insert(t); + } + } + } + if (failed(dynamicGlobalObjectFifos(device, builder, dynamicTiles))) + signalPassFailure(); + if (failed(unrollForLoops(device, builder, unrollTiles))) + signalPassFailure(); } //===------------------------------------------------------------------===// @@ -1559,4 +1734,4 @@ struct AIEObjectFifoStatefulTransformPass std::unique_ptr> AIE::createAIEObjectFifoStatefulTransformPass() { return std::make_unique(); -} +} \ No newline at end of file diff --git a/mlir_tutorials/tutorial-3/objectFifo_ver/README.md b/mlir_tutorials/tutorial-3/objectFifo_ver/README.md index 3fe124ad16..92067996ad 100755 --- a/mlir_tutorials/tutorial-3/objectFifo_ver/README.md +++ b/mlir_tutorials/tutorial-3/objectFifo_ver/README.md @@ -58,6 +58,8 @@ aie-opt --aie-canonicalize-device | aie-opt --aie-obj ``` We note that in the above command there are 
actually two lowering passes being applied. The first pass will ensure that there exists a target device configuration in the source code, or add one if there isn't. That is the same pass that is used by `aiecc.py`, but needs to be explicitly called when running lowering passes separately. Further details on the device configuration can be found in [tutorial-2b](../../tutorial-2/tutorial-2b/). +Two different lowerings currently exist for objectFifo operations: one is a static lowering that keeps track of acquire / release operations at compile-time and unrolls for-loops to ensure the proper buffer / lock pair is accessed each iteration; the other is a runtime solution which keeps track of acquire / release operations in a global state buffer which is then read to determine the correct buffer to access each iteration through an scf.IndexSwitchOp. Additional details can be found in the [Design Patterns](../../../docs/AIEDesignPatterns.md). + ## Tutorial 3 Lab 1. Read through the [/objectFifo_ver/aie.mlir](aie.mlir) design. In which tile and its local memory will the objectFifo lowering generate the buffer and its lock? diff --git a/programming_examples/dynamic_object_fifo/nested_loops/Makefile b/programming_examples/dynamic_object_fifo/nested_loops/Makefile new file mode 100644 index 0000000000..4e423e1df1 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/nested_loops/Makefile @@ -0,0 +1,66 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +# --- + +# The following environment variables that point to the Xilinx runtime (XRT) +# should be set up by an environment setup script already. 
+XILINX_XRT?=/opt/xilinx/xrt +XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../) + +# --- + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +XILINX_XRT_INCLUDE?=${XILINX_XRT}/include +XILINX_XRT_LIB?=${XILINX_XRT}/lib + +CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include +XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB} +XRT_LIBS=-lxrt_coreutil +CXX=g++-13 -ggdb + +#mlir_target?=build/aie.mlir +xclbin_target?=build/final.xclbin +insts_target?=build/insts.txt +host_target?=build/test + +.PHONY: all +all: ${xclbin_target} ${host_target} + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/kernel.o: ${srcdir}/kernel.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} + +${xclbin_target}: build/aie.mlir build/kernel.o + mkdir -p ${@D} + cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%} + +${host_target}: ${srcdir}/test.cpp ${xclbin_target} + mkdir -p ${@D} + ${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS} + +.PHONY: run +run: ${host_target} + ./${host_target} + +xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh +.PHONY: sign +sign: ${xclbin_target} + ${xclbin_sign} -dev Phoenix -xclbin $< + +.PHONY: clean +clean: + -rm -r build diff --git a/programming_examples/dynamic_object_fifo/nested_loops/README.md b/programming_examples/dynamic_object_fifo/nested_loops/README.md new file mode 100644 index 0000000000..d1184bb09a --- /dev/null +++ b/programming_examples/dynamic_object_fifo/nested_loops/README.md @@ -0,0 +1,40 @@ + + +# Dynamic Object FIFO - Nested Loops + +Contains an example of what an ObjectFIFO lowering may look like when it does not statically unroll loops, but instead chooses the buffers dynamically by using MLIR IndexSwitchOps and by keeping the ObjectFIFO state in the tile's local memory. 
+ +This design implements the communication from external memory to a compute tile in the AIE array, and back. The input data consists of five rows of 10xi32 tensors. In every iteration, the compute tile acquires one input row and, in a second nested loop, it applies a simple passthrough kernel on the input data. The nested loop executes five times on each input row, effectively generating an output tensor five times the size of the input tensor. + +## Source Files Overview + +1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and insts.txt for the NPU in Ryzen™ AI). + +2. `kernel.cc`: A C++ implementation of a simple passthrough operation for AIE cores. The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). The source can be found [here](../../../aie_kernels/aie2/add.cc). + +3. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the testbench verifies the memcpy results and optionally outputs trace data. 
+ + +## Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make +``` + +To run the design: + +``` +make run +``` diff --git a/programming_examples/dynamic_object_fifo/nested_loops/aie2.py b/programming_examples/dynamic_object_fifo/nested_loops/aie2.py new file mode 100644 index 0000000000..7c5babedb3 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/nested_loops/aie2.py @@ -0,0 +1,67 @@ +# dynamic_object_fifo/nested_loops/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext.scf import _for as range_ +from aie.extras.context import mlir_mod_ctx + +N = 50 +O = 250 +n_rows = 5 +dev = AIEDevice.npu1_1col +col = 0 + + +def nested_loops(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(N // n_rows, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile = tile(col, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile, 2, memRef_ty) + of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty) + + # AIE Core Function declarations + passthrough_10_i32 = external_func( + "passthrough_10_i32", inputs=[memRef_ty, memRef_ty] + ) + + # Set up compute tiles + + @core(ComputeTile, "kernel.o") + def core_body(): + for _ in range_(5): + elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) + for _ in range_(5): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + call(passthrough_10_i32, [elemIn, elemOut]) + of_out.release(ObjectFifoPort.Produce, 1) + of_in.release(ObjectFifoPort.Consume, 1) + + # To/from AIE-array data movement + tensor_ty = T.memref(N // n_rows, T.i32()) + + 
@runtime_sequence(tensor_ty, tensor_ty) + def sequence(A, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, O]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +nested_loops() diff --git a/programming_examples/dynamic_object_fifo/nested_loops/kernel.cc b/programming_examples/dynamic_object_fifo/nested_loops/kernel.cc new file mode 100644 index 0000000000..91d5d56ea4 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/nested_loops/kernel.cc @@ -0,0 +1,22 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include + +template +void passthrough(const T_in *__restrict in, T_out *__restrict out) { + for (int i = 0; i < N; i++) { + out[i] = in[i]; + } +} + +extern "C" { + +void passthrough_10_i32(const int *__restrict in, int *__restrict out) { + passthrough(in, out); +} +} diff --git a/programming_examples/dynamic_object_fifo/nested_loops/run_makefile.lit b/programming_examples/dynamic_object_fifo/nested_loops/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/nested_loops/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! 
diff --git a/programming_examples/dynamic_object_fifo/nested_loops/test.cpp b/programming_examples/dynamic_object_fifo/nested_loops/test.cpp new file mode 100644 index 0000000000..29a0c75193 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/nested_loops/test.cpp @@ -0,0 +1,126 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#ifndef XCLBIN +#define XCLBIN "build/final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "build/insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define INPUT_SIZE (50 * sizeof(int)) // in bytes +#define OUTPUT_SIZE (250 * sizeof(int)) // in bytes +#define WIDTH_SIZE (10 * sizeof(int)) // in bytes +#define WIDTH 10 +#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE +#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE + +#include "test_utils.h" + +int main(int argc, const char *argv[]) { + + std::vector instr_v = test_utils::load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = 
xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = + xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = + xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + int *buf_input = bo_input.map(); + std::cout << std::endl << std::endl << "Input: " << std::endl; + for (int i = 0; i < INPUT_ROWS; i++) { + std::cout << "row " << i << " : "; + for (int j = 0; j < WIDTH; j++) { + buf_input[i * WIDTH + j] = i; + std::cout << buf_input[i * WIDTH + j] << " "; + } + std::cout << std::endl << std::endl; + } + int *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_SIZE); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + bool pass = true; + std::cout << std::endl << "Output: " << std::endl; + int expected_output = 0; + int five_repetitions = 0; + for (int i = 0; i < OUTPUT_ROWS; i++) { + std::cout << "row " << i << std::endl; + if (five_repetitions == 5) { + expected_output++; + five_repetitions = 0; + } + for (int j = 0; j < WIDTH; j++) { + std::cout << "expected: " << expected_output << ", "; + std::cout << "got: " << buf_output[i * WIDTH + j] << std::endl; + pass &= buf_output[i * WIDTH + j] == expected_output; + } + std::cout << std::endl << std::endl; + five_repetitions++; + } + std::cout << std::endl << std::endl; + std::cout << (pass ? "PASS!" 
: "FAIL.") << std::endl; + + return 0; +} \ No newline at end of file diff --git a/programming_examples/dynamic_object_fifo/ping_pong/.gitignore b/programming_examples/dynamic_object_fifo/ping_pong/.gitignore new file mode 100644 index 0000000000..c795b054e5 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/.gitignore @@ -0,0 +1 @@ +build \ No newline at end of file diff --git a/programming_examples/dynamic_object_fifo/ping_pong/Makefile b/programming_examples/dynamic_object_fifo/ping_pong/Makefile new file mode 100644 index 0000000000..4e423e1df1 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/Makefile @@ -0,0 +1,66 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +# --- + +# The following environment variables that point to the Xilinx runtime (XRT) +# should be set up by an environment setup script already. 
+XILINX_XRT?=/opt/xilinx/xrt +XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../) + +# --- + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +XILINX_XRT_INCLUDE?=${XILINX_XRT}/include +XILINX_XRT_LIB?=${XILINX_XRT}/lib + +CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include +XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB} +XRT_LIBS=-lxrt_coreutil +CXX=g++-13 -ggdb + +#mlir_target?=build/aie.mlir +xclbin_target?=build/final.xclbin +insts_target?=build/insts.txt +host_target?=build/test + +.PHONY: all +all: ${xclbin_target} ${host_target} + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/kernel.o: ${srcdir}/kernel.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} + +${xclbin_target}: build/aie.mlir build/kernel.o + mkdir -p ${@D} + cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%} + +${host_target}: ${srcdir}/test.cpp ${xclbin_target} + mkdir -p ${@D} + ${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS} + +.PHONY: run +run: ${host_target} + ./${host_target} + +xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh +.PHONY: sign +sign: ${xclbin_target} + ${xclbin_sign} -dev Phoenix -xclbin $< + +.PHONY: clean +clean: + -rm -r build diff --git a/programming_examples/dynamic_object_fifo/ping_pong/README.md b/programming_examples/dynamic_object_fifo/ping_pong/README.md new file mode 100644 index 0000000000..bd6b9374d2 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/README.md @@ -0,0 +1,40 @@ + + +# Dynamic Object FIFO - Ping Pong + +Contains an example of what a ObjectFIFO lowering may look like that does not statically unroll loops, but instead chooses the buffers dynamically by using MLIR IndexSwitchOps and by keeping the ObjectFIFO state in the tile's local memory. 
+ +This design implements the communication from external memory to a compute tile in the AIE array, and back. The compute tile applies a simple passthrough kernel on incoming data before sending it back out. The communication leverages ping-pong data movements. + +## Source Files Overview + +1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and insts.txt for the NPU in Ryzen™ AI). + +2. `kernel.cc`: A C++ implementation of a simple passthrough operation for AIE cores. The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). The source can be found [here](../../../aie_kernels/aie2/add.cc). + +3. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the script verifies the memcpy results and optionally outputs trace data. + + +## Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make +``` + +To run the design: + +``` +make run +``` diff --git a/programming_examples/dynamic_object_fifo/ping_pong/aie2.py b/programming_examples/dynamic_object_fifo/ping_pong/aie2.py new file mode 100644 index 0000000000..995f6a100b --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/aie2.py @@ -0,0 +1,64 @@ +# dynamic_object_fifo/ping_pong/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext.scf import _for as range_ +from aie.extras.context import mlir_mod_ctx + +N = 1024 +dev = AIEDevice.npu1_1col +col = 0 + + +def ping_pong(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(N // 16, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile = tile(col, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile, 2, memRef_ty) + of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty) + + # AIE Core Function declarations + passthrough_64_i32 = external_func( + "passthrough_64_i32", inputs=[memRef_ty, memRef_ty] + ) + + # Set up compute tiles + + @core(ComputeTile, "kernel.o") + def core_body(): + for _ in range_(sys.maxsize): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) + call(passthrough_64_i32, [elemIn, elemOut]) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + tensor_ty = T.memref(N // 16, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty) + def sequence(A, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +ping_pong() diff --git a/programming_examples/dynamic_object_fifo/ping_pong/kernel.cc b/programming_examples/dynamic_object_fifo/ping_pong/kernel.cc new file mode 100644 index 0000000000..6dfc2ef45b --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/kernel.cc @@ -0,0 +1,22 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include + +template +void passthrough(const T_in *__restrict in, T_out *__restrict out) { + for (int i = 0; i < N; i++) { + out[i] = in[i]; + } +} + +extern "C" { + +void passthrough_64_i32(const int *__restrict in, int *__restrict out) { + passthrough(in, out); +} +} \ No newline at end of file diff --git a/programming_examples/dynamic_object_fifo/ping_pong/run_makefile.lit b/programming_examples/dynamic_object_fifo/ping_pong/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/dynamic_object_fifo/ping_pong/test.cpp b/programming_examples/dynamic_object_fifo/ping_pong/test.cpp new file mode 100644 index 0000000000..1716b3d7d9 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/ping_pong/test.cpp @@ -0,0 +1,103 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#ifndef XCLBIN +#define XCLBIN "build/final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "build/insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define INPUT_SIZE (1024 * sizeof(int)) // in bytes +#define OUTPUT_SIZE (1024 * sizeof(int)) // in bytes + +#include "test_utils.h" + +int main(int argc, const char *argv[]) { + + std::vector instr_v = test_utils::load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = + xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = + xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + int *buf_input = bo_input.map(); + for (int i = 0; i < INPUT_SIZE / sizeof(buf_input[0]); i++) { + buf_input[i] = i; + } + int *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_SIZE); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + 
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + bool pass = true; + for (int i = 0; i < OUTPUT_SIZE / sizeof(buf_output[0]); i++) { + std::cout << buf_output[i] << " "; + pass &= buf_output[i] == i; + } + std::cout << std::endl; + std::cout << (pass ? "PASS!" : "FAIL.") << std::endl; + + return 0; +} diff --git a/programming_examples/dynamic_object_fifo/reduction/Makefile b/programming_examples/dynamic_object_fifo/reduction/Makefile new file mode 100644 index 0000000000..4e423e1df1 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/reduction/Makefile @@ -0,0 +1,66 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +# --- + +# The following environment variables that point to the Xilinx runtime (XRT) +# should be set up by an environment setup script already. 
+XILINX_XRT?=/opt/xilinx/xrt +XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../) + +# --- + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +XILINX_XRT_INCLUDE?=${XILINX_XRT}/include +XILINX_XRT_LIB?=${XILINX_XRT}/lib + +CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include +XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB} +XRT_LIBS=-lxrt_coreutil +CXX=g++-13 -ggdb + +#mlir_target?=build/aie.mlir +xclbin_target?=build/final.xclbin +insts_target?=build/insts.txt +host_target?=build/test + +.PHONY: all +all: ${xclbin_target} ${host_target} + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/kernel.o: ${srcdir}/kernel.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} + +${xclbin_target}: build/aie.mlir build/kernel.o + mkdir -p ${@D} + cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%} + +${host_target}: ${srcdir}/test.cpp ${xclbin_target} + mkdir -p ${@D} + ${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS} + +.PHONY: run +run: ${host_target} + ./${host_target} + +xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh +.PHONY: sign +sign: ${xclbin_target} + ${xclbin_sign} -dev Phoenix -xclbin $< + +.PHONY: clean +clean: + -rm -r build diff --git a/programming_examples/dynamic_object_fifo/reduction/README.md b/programming_examples/dynamic_object_fifo/reduction/README.md new file mode 100644 index 0000000000..06be6303b2 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/reduction/README.md @@ -0,0 +1,40 @@ + + +# Dynamic Object FIFO - Reduction + +Contains an example of what a ObjectFIFO lowering may look like that does not statically unroll loops, but instead chooses the buffers dynamically by using MLIR IndexSwitchOps and by keeping the ObjectFIFO state in the tile's local memory. 
+ +This design implements the communication from external memory to a compute tile in the AIE array, and back. The input data consists of ten rows of 10xi32 tensors. Every iteration, the compute tile acquires two input rows and adds the values on each column. + +## Source Files Overview + +1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and insts.txt for the NPU in Ryzen™ AI). + +2. `kernel.cc`: A C++ implementation of a simple add operation for AIE cores. The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). The source can be found [here](../../../aie_kernels/aie2/add.cc). + +3. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the script verifies the memcpy results and optionally outputs trace data. + + +## Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make +``` + +To run the design: + +``` +make run +``` diff --git a/programming_examples/dynamic_object_fifo/reduction/aie2.py b/programming_examples/dynamic_object_fifo/reduction/aie2.py new file mode 100644 index 0000000000..9457a81370 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/reduction/aie2.py @@ -0,0 +1,67 @@ +# dynamic_object_fifo/reduction/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext.scf import _for as range_ +from aie.extras.context import mlir_mod_ctx + +N = 100 +O = 50 +n_rows = 10 +dev = AIEDevice.npu1_1col +col = 0 + + +def reduction(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(N // n_rows, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile = tile(col, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile, [2, 4], memRef_ty) + of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty) + + # AIE Core Function declarations + add_10_i32 = external_func( + "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty] + ) + + # Set up compute tiles + + @core(ComputeTile, "kernel.o") + def core_body(): + for _ in range_(sys.maxsize): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut]) + of_in.release(ObjectFifoPort.Consume, 2) + of_out.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + tensor_in_ty = T.memref(N, T.i32()) + tensor_out_ty = T.memref(O, T.i32()) + + @runtime_sequence(tensor_in_ty, tensor_out_ty) + def sequence(A, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, O]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +reduction() diff --git a/programming_examples/dynamic_object_fifo/reduction/kernel.cc b/programming_examples/dynamic_object_fifo/reduction/kernel.cc new file mode 100644 index 0000000000..ddb474e102 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/reduction/kernel.cc @@ -0,0 +1,24 @@ +// +// This file is 
licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include + +template +void add(const T_in *__restrict inA, const T_in *__restrict inB, + T_out *__restrict out) { + for (int i = 0; i < N; i++) { + out[i] = inA[i] + inB[i]; + } +} + +extern "C" { + +void add_10_i32(const int *__restrict inA, const int *__restrict inB, + int *__restrict out) { + add(inA, inB, out); +} +} diff --git a/programming_examples/dynamic_object_fifo/reduction/run_makefile.lit b/programming_examples/dynamic_object_fifo/reduction/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/reduction/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/dynamic_object_fifo/reduction/test.cpp b/programming_examples/dynamic_object_fifo/reduction/test.cpp new file mode 100644 index 0000000000..90d992e943 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/reduction/test.cpp @@ -0,0 +1,123 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#ifndef XCLBIN +#define XCLBIN "build/final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "build/insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define INPUT_SIZE (100 * sizeof(int)) // in bytes +#define OUTPUT_SIZE (50 * sizeof(int)) // in bytes +#define WIDTH_SIZE (10 * sizeof(int)) // in bytes +#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE +#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE + +#include "test_utils.h" + +int main(int argc, const char *argv[]) { + + std::vector instr_v = test_utils::load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = + xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = + xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + int *buf_input = bo_input.map(); + std::cout << std::endl << "Input: " << std::endl; + for (int i = 0; i < INPUT_ROWS; i++) { + std::cout << "row " << i << " : "; + for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) 
{ + buf_input[i * INPUT_ROWS + j] = i; + std::cout << buf_input[i * INPUT_ROWS + j] << " "; + } + std::cout << std::endl; + } + int *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_SIZE); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + bool pass = true; + std::cout << std::endl << "Output: " << std::endl; + int row = 0; + for (int i = 0; i < OUTPUT_ROWS; i++) { + std::cout << "row " << i << " : " << std::endl; + for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) { + int expected_output = 0; + expected_output = + buf_input[(row + 1) * INPUT_ROWS] + buf_input[row * INPUT_ROWS]; + std::cout << "expected: " << expected_output << ", "; + std::cout << buf_output[i * INPUT_ROWS + j] << std::endl; + pass &= buf_output[i * INPUT_ROWS + j] == expected_output; + } + row = row + 2; + std::cout << std::endl; + } + std::cout << std::endl << std::endl; + std::cout << (pass ? "PASS!" : "FAIL.") << std::endl; + + return 0; +} diff --git a/programming_examples/dynamic_object_fifo/sliding_window/Makefile b/programming_examples/dynamic_object_fifo/sliding_window/Makefile new file mode 100644 index 0000000000..4e423e1df1 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/Makefile @@ -0,0 +1,66 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +# --- + +# The following environment variables that point to the Xilinx runtime (XRT) +# should be set up by an environment setup script already. +XILINX_XRT?=/opt/xilinx/xrt +XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../) + +# --- + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +XILINX_XRT_INCLUDE?=${XILINX_XRT}/include +XILINX_XRT_LIB?=${XILINX_XRT}/lib + +CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include +XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB} +XRT_LIBS=-lxrt_coreutil +CXX=g++-13 -ggdb + +#mlir_target?=build/aie.mlir +xclbin_target?=build/final.xclbin +insts_target?=build/insts.txt +host_target?=build/test + +.PHONY: all +all: ${xclbin_target} ${host_target} + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/kernel.o: ${srcdir}/kernel.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} + +${xclbin_target}: build/aie.mlir build/kernel.o + mkdir -p ${@D} + cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%} + +${host_target}: ${srcdir}/test.cpp ${xclbin_target} + mkdir -p ${@D} + ${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS} + +.PHONY: run +run: ${host_target} + ./${host_target} + +xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh +.PHONY: sign +sign: ${xclbin_target} + ${xclbin_sign} -dev Phoenix -xclbin $< + +.PHONY: clean +clean: + -rm -r build diff --git a/programming_examples/dynamic_object_fifo/sliding_window/README.md b/programming_examples/dynamic_object_fifo/sliding_window/README.md new file mode 100644 index 
0000000000..7e486e7da7 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/README.md @@ -0,0 +1,44 @@ + + +# Dynamic Object FIFO - Sliding Window + +Contains an example of what an ObjectFIFO lowering may look like that does not statically unroll loops, but instead chooses the buffers dynamically by using MLIR IndexSwitchOps and by keeping the ObjectFIFO state in the tile's local memory. + +This design implements the communication from external memory to a compute tile in the AIE array, and back. The input data consists of ten rows of 10xi32 tensors. Every iteration, the compute tile acquires up to two input rows and adds the values on each column. It then releases only one of the two rows and continues on to the next input row following a sliding window pattern. + +The acquire / release patterns for the first and last rows differ from those for the rows in the middle of the input. The first row is added to itself to account for the border effect and as such we only acquire one row during the first iteration and release none. For the last iteration, we release two rows to account for the sliding window of 1. + +## Source Files Overview + +1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and insts.txt for the NPU in Ryzen™ AI). + +2. `aie2_if_else.py`: A variant of the Python script that uses if-else operations inside the compute tile's for loop to isolate the top and bottom rows of the input. (CURRENTLY NOT WORKING) + +3. `kernel.cc`: A C++ implementation of a simple add operation for AIE cores. The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). 
The source can be found [here](../../../aie_kernels/aie2/add.cc). + +4. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After execution, the testbench checks every output row against the expected sliding-window sums and prints `PASS!` or `FAIL.`. + + +## Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make +``` + +To run the design: + +``` +make run +``` diff --git a/programming_examples/dynamic_object_fifo/sliding_window/aie2.py b/programming_examples/dynamic_object_fifo/sliding_window/aie2.py new file mode 100644 index 0000000000..08d92c73e1 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/aie2.py @@ -0,0 +1,76 @@ +# dynamic_object_fifo/sliding_window/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext.scf import _for as range_ +from aie.extras.context import mlir_mod_ctx + +N = 100 +n_rows = 10 +dev = AIEDevice.npu1_1col +col = 0 + + +def sliding_window(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(N // n_rows, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile = tile(col, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile, 3, memRef_ty) + of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty) + + # AIE Core Function declarations + add_10_i32 = external_func( + "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty] + ) + + # Set up compute tiles + + @core(ComputeTile, "kernel.o") + def core_body(): + elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1) + elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1) + call(add_10_i32, [elemInPre, elemInPre, elemOutPre]) + of_out.release(ObjectFifoPort.Produce, 1) + + for _ in range_(8): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut]) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + + elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1) + elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOutPost]) + of_in.release(ObjectFifoPort.Consume, 2) + of_out.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty) + def sequence(A, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +sliding_window() 
diff --git a/programming_examples/dynamic_object_fifo/sliding_window/aie2_if_else.py b/programming_examples/dynamic_object_fifo/sliding_window/aie2_if_else.py new file mode 100644 index 0000000000..8ab2dfa636 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/aie2_if_else.py @@ -0,0 +1,74 @@ +# dynamic_object_fifo/sliding_window/aie2_if_else.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext.scf import _for as range_ +from aie.extras.context import mlir_mod_ctx + +N = 100 +n_rows = 10 +dev = AIEDevice.npu1_1col +col = 0 + + +def sliding_window(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(N // n_rows, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile = tile(col, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile, 3, memRef_ty) + of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty) + + # AIE Core Function declarations + add_10_i32 = external_func( + "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty] + ) + + # Set up compute tiles + + @core(ComputeTile, "kernel.o") + def core_body(): + for i in range_(10): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + if i == 0: + elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1) + call(add_10_i32, [elemInPre, elemInPre, elemOut]) + elif i == 9: + elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOut]) + of_in.release(ObjectFifoPort.Consume, 2) + else: + elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut]) + 
of_in.release(ObjectFifoPort.Consume, 1) + + of_out.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty) + def sequence(A, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +sliding_window() diff --git a/programming_examples/dynamic_object_fifo/sliding_window/kernel.cc b/programming_examples/dynamic_object_fifo/sliding_window/kernel.cc new file mode 100644 index 0000000000..ddb474e102 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/kernel.cc @@ -0,0 +1,24 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include + +template +void add(const T_in *__restrict inA, const T_in *__restrict inB, + T_out *__restrict out) { + for (int i = 0; i < N; i++) { + out[i] = inA[i] + inB[i]; + } +} + +extern "C" { + +void add_10_i32(const int *__restrict inA, const int *__restrict inB, + int *__restrict out) { + add(inA, inB, out); +} +} diff --git a/programming_examples/dynamic_object_fifo/sliding_window/run_makefile.lit b/programming_examples/dynamic_object_fifo/sliding_window/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! 
diff --git a/programming_examples/dynamic_object_fifo/sliding_window/test.cpp b/programming_examples/dynamic_object_fifo/sliding_window/test.cpp new file mode 100644 index 0000000000..971f17a60e --- /dev/null +++ b/programming_examples/dynamic_object_fifo/sliding_window/test.cpp @@ -0,0 +1,125 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#ifndef XCLBIN +#define XCLBIN "build/final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "build/insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define INPUT_SIZE (100 * sizeof(int)) // in bytes +#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes +#define WIDTH_SIZE (10 * sizeof(int)) // in bytes +#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE +#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE + +#include "test_utils.h" + +int main(int argc, const char *argv[]) { + + std::vector instr_v = test_utils::load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, 
kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = + xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = + xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + int *buf_input = bo_input.map(); + std::cout << std::endl << std::endl << "Input: " << std::endl; + for (int i = 0; i < INPUT_ROWS; i++) { + std::cout << "row " << i << " : "; + for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) { + buf_input[i * INPUT_ROWS + j] = i; + std::cout << buf_input[i * INPUT_ROWS + j] << " "; + } + std::cout << std::endl << std::endl; + } + int *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_SIZE); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. 
Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + bool pass = true; + std::cout << std::endl << "Output: " << std::endl; + for (int i = 0; i < OUTPUT_ROWS; i++) { + std::cout << "row " << i << std::endl; + for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) { + int expected_output = 0; + if (i == 0) { + expected_output = buf_input[i * INPUT_ROWS] * 2; + } else { + expected_output = + buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS]; + } + std::cout << "expected: " << expected_output << ", "; + std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl; + pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output; + } + std::cout << std::endl << std::endl; + } + std::cout << std::endl << std::endl; + std::cout << (pass ? "PASS!" : "FAIL.") << std::endl; + + return 0; +} \ No newline at end of file diff --git a/programming_examples/dynamic_object_fifo/two_core_sliding_window/Makefile b/programming_examples/dynamic_object_fifo/two_core_sliding_window/Makefile new file mode 100644 index 0000000000..4e423e1df1 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/two_core_sliding_window/Makefile @@ -0,0 +1,66 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +# --- + +# The following environment variables that point to the Xilinx runtime (XRT) +# should be set up by an environment setup script already. 
+XILINX_XRT?=/opt/xilinx/xrt +XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../) + +# --- + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +XILINX_XRT_INCLUDE?=${XILINX_XRT}/include +XILINX_XRT_LIB?=${XILINX_XRT}/lib + +CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include +XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB} +XRT_LIBS=-lxrt_coreutil +CXX=g++-13 -ggdb + +#mlir_target?=build/aie.mlir +xclbin_target?=build/final.xclbin +insts_target?=build/insts.txt +host_target?=build/test + +.PHONY: all +all: ${xclbin_target} ${host_target} + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/kernel.o: ${srcdir}/kernel.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} + +${xclbin_target}: build/aie.mlir build/kernel.o + mkdir -p ${@D} + cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%} + +${host_target}: ${srcdir}/test.cpp ${xclbin_target} + mkdir -p ${@D} + ${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS} + +.PHONY: run +run: ${host_target} + ./${host_target} + +xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh +.PHONY: sign +sign: ${xclbin_target} + ${xclbin_sign} -dev Phoenix -xclbin $< + +.PHONY: clean +clean: + -rm -r build diff --git a/programming_examples/dynamic_object_fifo/two_core_sliding_window/README.md b/programming_examples/dynamic_object_fifo/two_core_sliding_window/README.md new file mode 100644 index 0000000000..806dba3377 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/two_core_sliding_window/README.md @@ -0,0 +1,40 @@ + + +# Dynamic Object FIFO - Two Core Sliding Window + +Contains an example of what a ObjectFIFO lowering may look like that does not statically unroll loops, but instead chooses the buffers dynamically by using MLIR IndexSwitchOps and by keeping the ObjectFIFO state in 
the tile's local memory. + +This design implements the communication from external memory to a first compute tile in the AIE array, which sends the data to a second tile that computes the final output to send back to external memory. The input data consists of ten rows of 10xi32 tensors. The first compute tile applies a simple passthrough kernel on incoming data before sending it further. For every two rows, the second tile applies an addition following the same sliding window pattern shown in the [sliding_window](../sliding_window/) example. + +## Source Files Overview + +1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and insts.txt for the NPU in Ryzen™ AI). + +2. `kernel.cc`: C++ implementations of passthrough and add operations for AIE cores. The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). The source can be found [here](../../../aie_kernels/aie2/add.cc). + +3. `test.cpp`: This C++ code is a testbench for the design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After execution, the testbench checks every output row against the expected sliding-window sums and prints `PASS!` or `FAIL.`. 
+ + +## Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make +``` + +To run the design: + +``` +make run +``` diff --git a/programming_examples/dynamic_object_fifo/two_core_sliding_window/aie2.py b/programming_examples/dynamic_object_fifo/two_core_sliding_window/aie2.py new file mode 100644 index 0000000000..e815fada7c --- /dev/null +++ b/programming_examples/dynamic_object_fifo/two_core_sliding_window/aie2.py @@ -0,0 +1,90 @@ +# dynamic_object_fifo/two_core_sliding_window/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext.scf import _for as range_ +from aie.extras.context import mlir_mod_ctx + +N = 100 +n_rows = 10 +dev = AIEDevice.npu1_1col +col = 0 + + +def two_core_sliding_window(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(N // n_rows, T.i32()) + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile = tile(col, 2) + ComputeTile2 = tile(col, 4) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile, 2, memRef_ty) + of_in2 = object_fifo("in2", ComputeTile, ComputeTile2, 3, memRef_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) + + # AIE Core Function declarations + passthrough_10_i32 = external_func( + "passthrough_10_i32", inputs=[memRef_ty, memRef_ty] + ) + add_10_i32 = external_func( + "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty] + ) + + # Set up compute tiles + + @core(ComputeTile, "kernel.o") + def core_body(): + for _ in range_(10): + elemOut = of_in2.acquire(ObjectFifoPort.Produce, 1) + elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) + 
call(passthrough_10_i32, [elemIn, elemOut]) + of_in.release(ObjectFifoPort.Consume, 1) + of_in2.release(ObjectFifoPort.Produce, 1) + + @core(ComputeTile2, "kernel.o") + def core_body(): + elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1) + elemInPre = of_in2.acquire(ObjectFifoPort.Consume, 1) + call(add_10_i32, [elemInPre, elemInPre, elemOutPre]) + of_out.release(ObjectFifoPort.Produce, 1) + + for _ in range_(8): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + elemsIn = of_in2.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut]) + of_in2.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + + elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1) + elemsInPost = of_in2.acquire(ObjectFifoPort.Consume, 2) + call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOutPost]) + of_in2.release(ObjectFifoPort.Consume, 2) + of_out.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty) + def sequence(A, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +two_core_sliding_window() diff --git a/programming_examples/dynamic_object_fifo/two_core_sliding_window/kernel.cc b/programming_examples/dynamic_object_fifo/two_core_sliding_window/kernel.cc new file mode 100644 index 0000000000..7e4515193c --- /dev/null +++ b/programming_examples/dynamic_object_fifo/two_core_sliding_window/kernel.cc @@ -0,0 +1,38 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +#include + +template +void passthrough(const T_in *__restrict in, T_out *__restrict out) { + for (int i = 0; i < N; i++) { + out[i] = in[i]; + } +} + +extern "C" { + +void passthrough_10_i32(const int *__restrict in, int *__restrict out) { + passthrough(in, out); +} +} + +template +void add(const T_in *__restrict inA, const T_in *__restrict inB, + T_out *__restrict out) { + for (int i = 0; i < N; i++) { + out[i] = inA[i] + inB[i]; + } +} + +extern "C" { + +void add_10_i32(const int *__restrict inA, const int *__restrict inB, + int *__restrict out) { + add(inA, inB, out); +} +} diff --git a/programming_examples/dynamic_object_fifo/two_core_sliding_window/run_makefile.lit b/programming_examples/dynamic_object_fifo/two_core_sliding_window/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/dynamic_object_fifo/two_core_sliding_window/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/dynamic_object_fifo/two_core_sliding_window/test.cpp b/programming_examples/dynamic_object_fifo/two_core_sliding_window/test.cpp new file mode 100644 index 0000000000..971f17a60e --- /dev/null +++ b/programming_examples/dynamic_object_fifo/two_core_sliding_window/test.cpp @@ -0,0 +1,125 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#ifndef XCLBIN +#define XCLBIN "build/final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "build/insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define INPUT_SIZE (100 * sizeof(int)) // in bytes +#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes +#define WIDTH_SIZE (10 * sizeof(int)) // in bytes +#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE +#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE + +#include "test_utils.h" + +int main(int argc, const char *argv[]) { + + std::vector instr_v = test_utils::load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = + xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = + xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + int *buf_input = bo_input.map(); + std::cout << std::endl << std::endl << "Input: " << std::endl; + for (int i = 0; i < INPUT_ROWS; i++) { + std::cout << "row " << i << " : "; + for (int j = 0; j < WIDTH_SIZE / 
sizeof(buf_input[0]); j++) { + buf_input[i * INPUT_ROWS + j] = i; + std::cout << buf_input[i * INPUT_ROWS + j] << " "; + } + std::cout << std::endl << std::endl; + } + int *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_SIZE); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + bool pass = true; + std::cout << std::endl << "Output: " << std::endl; + for (int i = 0; i < OUTPUT_ROWS; i++) { + std::cout << "row " << i << std::endl; + for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) { + int expected_output = 0; + if (i == 0) { + expected_output = buf_input[i * INPUT_ROWS] * 2; + } else { + expected_output = + buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS]; + } + std::cout << "expected: " << expected_output << ", "; + std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl; + pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output; + } + std::cout << std::endl << std::endl; + } + std::cout << std::endl << std::endl; + std::cout << (pass ? "PASS!" 
: "FAIL.") << std::endl; + + return 0; +} \ No newline at end of file diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py index 343d3cf045..ff6a44e06e 100644 --- a/python/compiler/aiecc/cl_arguments.py +++ b/python/compiler/aiecc/cl_arguments.py @@ -158,6 +158,13 @@ def parse_args(args=None): action="store_true", help="Generate column-wise overlay of control packet routings", ) + parser.add_argument( + "--dynamic-objFifos", + dest="dynamic_objFifos", + default=False, + action="store_true", + help="Use dynamic object fifos for the for loops", + ) parser.add_argument( "--aie-generate-airbin", dest="airbin", diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index ef43a08bff..4071b5fb30 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -37,7 +37,7 @@ from aie.passmanager import PassManager INPUT_WITH_ADDRESSES_PIPELINE = ( - lambda basic_alloc_scheme=False, ctrl_pkt_overlay=False: ( + lambda basic_alloc_scheme=False, dynamic_objFifos=False, ctrl_pkt_overlay=False: ( Pipeline() .lower_affine() .add_pass("aie-canonicalize-device") @@ -46,7 +46,9 @@ Pipeline() .add_pass("aie-assign-lock-ids") .add_pass("aie-register-objectFifos") - .add_pass("aie-objectFifo-stateful-transform") + .add_pass( + "aie-objectFifo-stateful-transform", dynamic_objFifos=dynamic_objFifos + ) .add_pass("aie-assign-bd-ids") .add_pass("aie-lower-cascade-flows") .add_pass("aie-lower-broadcast-packet") @@ -1057,7 +1059,7 @@ async def run_flow(self): file_with_addresses = self.prepend_tmp("input_with_addresses.mlir") pass_pipeline = INPUT_WITH_ADDRESSES_PIPELINE( - opts.basic_alloc_scheme, opts.ctrl_pkt_overlay + opts.basic_alloc_scheme, opts.dynamic_objFifos, opts.ctrl_pkt_overlay ).materialize(module=True) run_passes( pass_pipeline, diff --git a/python/dialects/aie.py b/python/dialects/aie.py index a18701fefe..b9df2760b4 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -229,8 +229,13 @@ 
def bds(parent): class Core(CoreOp): # Until https://github.com/llvm/llvm-project/pull/73620 gets figured out. - def __init__(self, tile, link_with: str | None = None): - super().__init__(result=T.index(), tile=tile, link_with=link_with) + def __init__(self, tile, link_with=None, dynamic_objfifo_lowering=None): + super().__init__( + result=T.index(), + tile=tile, + link_with=link_with, + dynamic_objfifo_lowering=dynamic_objfifo_lowering, + ) # Create an aie buffer of (shape x datatype) on given tile. diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir new file mode 100644 index 0000000000..47f5fb34c6 --- /dev/null +++ b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir @@ -0,0 +1,137 @@ +//===- dynamic_lowering_flag_test.mlir --------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + +// CHECK: %core_0_2 = aie.core(%tile_0_2) { +// CHECK: %c0 = arith.constant 0 : index +// CHECK: %c0_0 = arith.constant 0 : index +// CHECK: %c2 = arith.constant 2 : index +// CHECK: memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c1 = arith.constant 1 : index +// CHECK: %c2_1 = arith.constant 2 : index +// CHECK: memref.store %c0, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %c0_2 = arith.constant 0 : index +// CHECK: %c1_3 = arith.constant 1 : index +// CHECK: %c10 = arith.constant 10 : index +// CHECK: scf.for %arg0 = %c0_2 to %c10 step %c1_3 { +// CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) +// CHECK: %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %1 = scf.index_switch %0 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %output_fifo_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: %2 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %3 = scf.index_switch %2 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: func.call @passthrough_10_i32(%3, %1) : (memref<10xi32>, memref<10xi32>) -> () +// CHECK: aie.use_lock(%input_fifo_cons_prod_lock, Release, 1) +// CHECK: %4 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %c1_4 = arith.constant 1 : index +// CHECK: %5 = arith.addi %4, %c1_4 : index +// CHECK: %6 
= arith.remsi %5, %c2_1 : index +// CHECK: memref.store %6, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) +// CHECK: %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c1_5 = arith.constant 1 : index +// CHECK: %8 = arith.addi %7, %c1_5 : index +// CHECK: %9 = arith.remsi %8, %c2 : index +// CHECK: memref.store %9, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: } +// CHECK: aie.end +// CHECK: } {dynamic_objfifo_lowering = true} +// CHECK: aie.shim_dma_allocation @input_fifo(MM2S, 0, 0) +// CHECK: %core_0_4 = aie.core(%tile_0_4) { +// CHECK: %c0 = arith.constant 0 : index +// CHECK: %c1 = arith.constant 1 : index +// CHECK: %c10 = arith.constant 10 : index +// CHECK: %c2 = arith.constant 2 : index +// CHECK: scf.for %arg0 = %c0 to %c10 step %c2 { +// CHECK: aie.use_lock(%output_fifo2_prod_lock, AcquireGreaterEqual, 1) +// CHECK: aie.use_lock(%input_fifo2_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: func.call @passthrough_10_i32(%input_fifo2_cons_buff_0, %output_fifo2_buff_0) : (memref<10xi32>, memref<10xi32>) -> () +// CHECK: aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1) +// CHECK: aie.use_lock(%output_fifo2_cons_lock, Release, 1) +// CHECK: aie.use_lock(%output_fifo2_prod_lock, AcquireGreaterEqual, 1) +// CHECK: aie.use_lock(%input_fifo2_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: func.call @passthrough_10_i32(%input_fifo2_cons_buff_1, %output_fifo2_buff_1) : (memref<10xi32>, memref<10xi32>) -> () +// CHECK: aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1) +// CHECK: aie.use_lock(%output_fifo2_cons_lock, Release, 1) +// CHECK: } +// CHECK: aie.end +// CHECK: } + +module { + aie.device(npu1_1col) { + func.func @passthrough_10_i32(%line_in: memref<10xi32>, %line_out: memref<10xi32>) -> () { + return + } + + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %tile_0_4 = aie.tile(0, 4) + aie.objectfifo @input_fifo(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<10xi32>> + 
aie.objectfifo @output_fifo(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<10xi32>> + + aie.objectfifo @input_fifo2(%tile_0_0, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<10xi32>> + aie.objectfifo @output_fifo2(%tile_0_4, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<10xi32>> + + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + + scf.for %arg0 = %c0 to %c10 step %c1 { + %0 = aie.objectfifo.acquire @output_fifo(Produce, 1) : !aie.objectfifosubview<memref<10xi32>> + %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %2 = aie.objectfifo.acquire @input_fifo(Consume, 1) : !aie.objectfifosubview<memref<10xi32>> + %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + func.call @passthrough_10_i32(%3, %1) : (memref<10xi32>, memref<10xi32>) -> () + aie.objectfifo.release @input_fifo(Consume, 1) + aie.objectfifo.release @output_fifo(Produce, 1) + } + + aie.end + } {dynamic_objfifo_lowering = true} + + %core_0_4 = aie.core(%tile_0_4) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + + scf.for %arg0 = %c0 to %c10 step %c1 { + %0 = aie.objectfifo.acquire @output_fifo2(Produce, 1) : !aie.objectfifosubview<memref<10xi32>> + %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %2 = aie.objectfifo.acquire @input_fifo2(Consume, 1) : !aie.objectfifosubview<memref<10xi32>> + %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + func.call @passthrough_10_i32(%3, %1) : (memref<10xi32>, memref<10xi32>) -> () + aie.objectfifo.release @input_fifo2(Consume, 1) + aie.objectfifo.release @output_fifo2(Produce, 1) + } + + aie.end + } + } +} diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir 
b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir new file mode 100644 index 0000000000..5bb762b29d --- /dev/null +++ b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir @@ 
-0,0 +1,244 @@ +//===- dynamic_lowering_test.mlir ------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform=dynamic-objFifos %s | FileCheck %s + +// CHECK: aie.device(npu1_1col) { +// CHECK: memref.global "public" @output_fifo_cons : memref<10xi32> +// CHECK: memref.global "public" @output_fifo : memref<10xi32> +// CHECK: memref.global "public" @input_fifo_cons : memref<10xi32> +// CHECK: memref.global "public" @input_fifo : memref<10xi32> +// CHECK: func.func @add_10_i32(%arg0: memref<10xi32>, %arg1: memref<10xi32>, %arg2: memref<10xi32>) { +// CHECK: return +// CHECK: } +// CHECK: %tile_0_0 = aie.tile(0, 0) +// CHECK: %tile_0_2 = aie.tile(0, 2) +// CHECK: %output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "output_fifo_cons_prod_lock"} +// CHECK: %output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"} +// CHECK: %output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32> +// CHECK: %output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32> +// CHECK: %output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"} +// CHECK: %output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"} +// CHECK: %input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32> +// CHECK: %input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32> +// CHECK: %input_fifo_cons_buff_2 = 
aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_2"} : memref<10xi32> +// CHECK: %input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 3 : i32, sym_name = "input_fifo_cons_prod_lock"} +// CHECK: %input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"} +// CHECK: %input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "input_fifo_prod_lock"} +// CHECK: %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"} +// CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) +// CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) +// CHECK: aie.shim_dma_allocation @input_fifo(MM2S, 0, 0) +// CHECK: %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> +// CHECK: %core_0_2 = aie.core(%tile_0_2) { +// CHECK: %c0 = arith.constant 0 : index +// CHECK: %c0_0 = arith.constant 0 : index +// CHECK: %c2 = arith.constant 2 : index +// CHECK: memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c1 = arith.constant 1 : index +// CHECK: %c3 = arith.constant 3 : index +// CHECK: memref.store %c0, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %c0_1 = arith.constant 0 : index +// CHECK: %c1_2 = arith.constant 1 : index +// CHECK: %c9 = arith.constant 9 : index +// CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) +// CHECK: %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %1 = scf.index_switch %0 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %output_fifo_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: %2 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %3 = scf.index_switch %2 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield 
%input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: case 2 { +// CHECK: scf.yield %input_fifo_cons_buff_2 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: func.call @add_10_i32(%3, %3, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () +// CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) +// CHECK: %4 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c1_3 = arith.constant 1 : index +// CHECK: %5 = arith.addi %4, %c1_3 : index +// CHECK: %6 = arith.remsi %5, %c2 : index +// CHECK: memref.store %6, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: scf.for %arg0 = %c0_1 to %c9 step %c1_2 { +// CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) +// CHECK: %19 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %20 = scf.index_switch %19 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %output_fifo_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: %21 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %22 = scf.index_switch %21 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: case 2 { +// CHECK: scf.yield %input_fifo_cons_buff_2 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: %23 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %24 = scf.index_switch %23 -> memref<10xi32> +// CHECK: case 
0 { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %input_fifo_cons_buff_2 : memref<10xi32> +// CHECK: } +// CHECK: case 2 { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: func.call @add_10_i32(%22, %24, %20) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () +// CHECK: aie.use_lock(%input_fifo_cons_prod_lock, Release, 1) +// CHECK: %25 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %c1_6 = arith.constant 1 : index +// CHECK: %26 = arith.addi %25, %c1_6 : index +// CHECK: %27 = arith.remsi %26, %c3 : index +// CHECK: memref.store %27, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) +// CHECK: %28 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c1_7 = arith.constant 1 : index +// CHECK: %29 = arith.addi %28, %c1_7 : index +// CHECK: %30 = arith.remsi %29, %c2 : index +// CHECK: memref.store %30, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: } +// CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) +// CHECK: %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %8 = scf.index_switch %7 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %output_fifo_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) +// CHECK: %9 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %10 = scf.index_switch %9 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// 
CHECK: case 2 { +// CHECK: scf.yield %input_fifo_cons_buff_2 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: %11 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %12 = scf.index_switch %11 -> memref<10xi32> +// CHECK: case 0 { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: case 1 { +// CHECK: scf.yield %input_fifo_cons_buff_2 : memref<10xi32> +// CHECK: } +// CHECK: case 2 { +// CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> +// CHECK: } +// CHECK: default { +// CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> +// CHECK: } +// CHECK: func.call @add_10_i32(%10, %12, %8) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () +// CHECK: aie.use_lock(%input_fifo_cons_prod_lock, Release, 2) +// CHECK: %13 = memref.load %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %c2_4 = arith.constant 2 : index +// CHECK: %14 = arith.addi %13, %c2_4 : index +// CHECK: %15 = arith.remsi %14, %c3 : index +// CHECK: memref.store %15, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) +// CHECK: %16 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c1_5 = arith.constant 1 : index +// CHECK: %17 = arith.addi %16, %c1_5 : index +// CHECK: %18 = arith.remsi %17, %c2 : index +// CHECK: memref.store %18, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: aie.end +// CHECK: } + +module { + aie.device(npu1_1col) { + func.func @add_10_i32(%line_in1: memref<10xi32>, %line_in2: memref<10xi32>, %line_out: memref<10xi32>) -> () { + return + } + + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + aie.objectfifo @input_fifo(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<10xi32>> + aie.objectfifo @output_fifo(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<10xi32>> + + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + 
%c9 = arith.constant 9 : 
index + + %0 = aie.objectfifo.acquire @output_fifo(Produce, 1) : !aie.objectfifosubview<memref<10xi32>> + %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %2 = aie.objectfifo.acquire @input_fifo(Consume, 1) : !aie.objectfifosubview<memref<10xi32>> + %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + func.call @add_10_i32(%3, %3, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () + aie.objectfifo.release @output_fifo(Produce, 1) + + scf.for %arg0 = %c0 to %c9 step %c1 { + %4 = aie.objectfifo.acquire @output_fifo(Produce, 1) : !aie.objectfifosubview<memref<10xi32>> + %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %6 = aie.objectfifo.acquire @input_fifo(Consume, 2) : !aie.objectfifosubview<memref<10xi32>> + %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %8 = aie.objectfifo.subview.access %6[1] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + func.call @add_10_i32(%7, %8, %5) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () + aie.objectfifo.release @input_fifo(Consume, 1) + aie.objectfifo.release @output_fifo(Produce, 1) + } + + %9 = aie.objectfifo.acquire @output_fifo(Produce, 1) : !aie.objectfifosubview<memref<10xi32>> + %10 = aie.objectfifo.subview.access %9[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %11 = aie.objectfifo.acquire @input_fifo(Consume, 2) : !aie.objectfifosubview<memref<10xi32>> + %12 = aie.objectfifo.subview.access %11[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + %13 = aie.objectfifo.subview.access %11[1] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32> + func.call @add_10_i32(%12, %13, %10) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () + aie.objectfifo.release @input_fifo(Consume, 2) + aie.objectfifo.release @output_fifo(Produce, 1) + + aie.end + } + } 
+}