diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 82691ccf29..393365fb9e 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -69,47 +69,65 @@ class LockAnalysis {
 };
 
 //===----------------------------------------------------------------------===//
-// TileDMA Channel Analysis
+// DMA Channel Analysis
 //===----------------------------------------------------------------------===//
 class DMAChannelAnalysis {
-  DenseMap<Value, int> masterChannelsPerTile;
-  DenseMap<Value, int> slaveChannelsPerTile;
+  DenseMap<std::tuple<Value, DMAChannelDir, int>, int> channelsPerTile;
 
 public:
   DMAChannelAnalysis(DeviceOp &device) {
-    // go over the channels used for each tile and update the master/slave
-    // channel maps
+    // go over the channels used for each tile and update the channel map
     for (auto memOp : device.getOps<MemOp>()) {
       Region &r = memOp.getBody();
       for (auto &bl : r.getBlocks()) {
         for (auto op : bl.getOps<DMAStartOp>()) {
-          if (op.isSend())
-            getMasterDMAChannel(memOp.getTile());
-          else
-            getSlaveDMAChannel(memOp.getTile());
+          channelsPerTile[{memOp.getTile(), op.getChannelDir(),
+                           op.getChannelIndex()}] = 1;
+        }
+      }
+    }
+    for (auto memOp : device.getOps<MemTileDMAOp>()) {
+      Region &r = memOp.getBody();
+      for (auto &bl : r.getBlocks()) {
+        for (auto op : bl.getOps<DMAStartOp>()) {
+          channelsPerTile[{memOp.getTile(), op.getChannelDir(),
+                           op.getChannelIndex()}] = 1;
+        }
+      }
+    }
+    for (auto memOp : device.getOps<ShimDMAOp>()) {
+      Region &r = memOp.getBody();
+      for (auto &bl : r.getBlocks()) {
+        for (auto op : bl.getOps<DMAStartOp>()) {
+          channelsPerTile[{memOp.getTile(), op.getChannelDir(),
+                           op.getChannelIndex()}] = 1;
         }
       }
     }
   }
 
-  /// Given an AIE tile, returns its next usable master channel.
-  DMAChannel getMasterDMAChannel(Value tile) {
-    if (masterChannelsPerTile.find(tile) == masterChannelsPerTile.end())
-      masterChannelsPerTile[tile] = 0;
-    else
-      masterChannelsPerTile[tile]++;
-    DMAChannel dmaChan = {DMAChannelDir::MM2S, masterChannelsPerTile[tile]};
-    return dmaChan;
-  }
-
-  /// Given an AIE tile, returns its next usable slave channel.
-  DMAChannel getSlaveDMAChannel(Value tile) {
-    if (slaveChannelsPerTile.find(tile) == slaveChannelsPerTile.end())
-      slaveChannelsPerTile[tile] = 0;
-    else
-      slaveChannelsPerTile[tile]++;
-    DMAChannel dmaChan = {DMAChannelDir::S2MM, slaveChannelsPerTile[tile]};
-    return dmaChan;
+  /// Given a tile and DMAChannelDir, returns the next usable channel index
+  /// for that tile.
+  int getDMAChannelIndex(TileOp tileOp, DMAChannelDir dir) {
+    const auto &targetModel = getTargetModel(tileOp);
+    int maxChannelNum = 0;
+    if (tileOp.isShimTile())
+      maxChannelNum = 2;
+    else {
+      if (dir == DMAChannelDir::MM2S)
+        maxChannelNum = targetModel.getNumSourceSwitchboxConnections(
+            tileOp.getCol(), tileOp.getRow(), WireBundle::DMA);
+      else
+        maxChannelNum = targetModel.getNumDestSwitchboxConnections(
+            tileOp.getCol(), tileOp.getRow(), WireBundle::DMA);
+    }
+    for (int i = 0; i < maxChannelNum; i++)
+      if (int usageCnt = channelsPerTile[{tileOp.getResult(), dir, i}];
+          usageCnt == 0) {
+        channelsPerTile[{tileOp.getResult(), dir, i}] = 1;
+        return i;
+      }
+    return -1;
  }
};
@@ -1518,8 +1536,12 @@ struct AIEObjectFifoStatefulTransformPass
       // rely on shared memory and share the same buffers.
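For orientation: the new analysis replaces the two monotonically growing per-direction counters with a single occupancy map keyed by (tile, direction, channel index), and getDMAChannelIndex does a first-fit scan over that map, so channels already claimed by user-written aie.mem, aie.memtile_dma, or aie.shim_dma regions are skipped instead of being handed out twice. A minimal standalone C++ sketch of the same policy (plain int tile ids stand in for mlir::Value and std::map for DenseMap; this is an illustration, not the pass's actual types):

#include <map>
#include <tuple>

enum class DMAChannelDir { MM2S, S2MM };

// Key: (tile id, direction, channel index); value 1 marks the channel taken.
using ChannelKey = std::tuple<int, DMAChannelDir, int>;

// First-fit scan mirroring getDMAChannelIndex: return the lowest free channel
// index below maxChannelNum and reserve it, or -1 if every channel is taken.
int getDMAChannelIndex(std::map<ChannelKey, int> &channelsPerTile, int tile,
                       DMAChannelDir dir, int maxChannelNum) {
  for (int i = 0; i < maxChannelNum; i++)
    if (channelsPerTile[{tile, dir, i}] == 0) {
      channelsPerTile[{tile, dir, i}] = 1;
      return i;
    }
  return -1;
}

With every channel of a tile pre-marked as taken, the scan returns -1, which is exactly the condition the new tileDMA_test_bad*.mlir tests below exercise.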
      for (auto &[producer, consumers] : splitFifos) {
        // create producer tile DMA
-        DMAChannel producerChan =
-            dmaAnalysis.getMasterDMAChannel(producer.getProducerTile());
+        int producerChanIndex = dmaAnalysis.getDMAChannelIndex(
+            producer.getProducerTileOp(), DMAChannelDir::MM2S);
+        if (producerChanIndex == -1)
+          producer.getProducerTileOp().emitOpError(
+              "number of output DMA channels exceeded!");
+        DMAChannel producerChan = {DMAChannelDir::MM2S, producerChanIndex};
        createDMA(device, builder, producer, producerChan.direction,
                  producerChan.channel, 0, producer.getDimensionsToStreamAttr(),
                  producer.getPadDimensionsAttr());
@@ -1535,8 +1557,12 @@ struct AIEObjectFifoStatefulTransformPass
        for (auto consumer : consumers) {
          // create consumer tile DMA
-          DMAChannel consumerChan =
-              dmaAnalysis.getSlaveDMAChannel(consumer.getProducerTile());
+          int consumerChanIndex = dmaAnalysis.getDMAChannelIndex(
+              consumer.getProducerTileOp(), DMAChannelDir::S2MM);
+          if (consumerChanIndex == -1)
+            consumer.getProducerTileOp().emitOpError(
+                "number of input DMA channels exceeded!");
+          DMAChannel consumerChan = {DMAChannelDir::S2MM, consumerChanIndex};
          BDDimLayoutArrayAttr consumerDims =
              consumer.getDimensionsFromStreamPerConsumer()[0];
          createDMA(device, builder, consumer, consumerChan.direction,
diff --git a/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie.mlir b/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie.mlir
deleted file mode 100644
index 52bc3e1af6..0000000000
--- a/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie.mlir
+++ /dev/null
@@ -1,240 +0,0 @@
-//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates -// -//===----------------------------------------------------------------------===// - -module { - aie.device(npu1_4col) { - memref.global "public" @out_cons : memref<16xi32> - memref.global "public" @out : memref<16xi32> - memref.global "public" @in2_mem_cons : memref<256xi32> - memref.global "public" @in2_mem : memref<256xi32> - memref.global "public" @in1_cons : memref<16xi32> - memref.global "public" @in1 : memref<16xi32> - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_1_1 = aie.tile(1, 1) - %tile_2_1 = aie.tile(2, 1) - %tile_0_2 = aie.tile(0, 2) - %out_buff_0 = aie.buffer(%tile_0_2) {sym_name = "out_buff_0"} : memref<16xi32> - %out_buff_1 = aie.buffer(%tile_0_2) {sym_name = "out_buff_1"} : memref<16xi32> - %out_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "out_prod_lock"} - %out_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "out_cons_lock"} - %in2_mem_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_0"} : memref<256xi32> - %in2_mem_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_1"} : memref<256xi32> - %in2_mem_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "in2_mem_cons_prod_lock"} - %in2_mem_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "in2_mem_cons_cons_lock"} - %in2_mem_buff_0 = aie.buffer(%tile_0_1) {sym_name = "in2_mem_buff_0"} : memref<64xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]> - %in2_mem_buff_1 = aie.buffer(%tile_1_1) {sym_name = "in2_mem_buff_1"} : memref<64xi32> = dense<[64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]> - %in2_mem_buff_2 = aie.buffer(%tile_2_1) {sym_name = "in2_mem_buff_2"} : memref<64xi32> = dense<[128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]> - %in2_mem_buff_3 = aie.buffer(%tile_2_1) {sym_name = "in2_mem_buff_3"} : memref<64xi32> = dense<[192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]> - %in2_mem_prod_lock = aie.lock(%tile_0_1, 0) {init = 0 : i32, sym_name = "in2_mem_prod_lock"} - %in2_mem_cons_lock = aie.lock(%tile_0_1, 1) {init = 1 : i32, sym_name = "in2_mem_cons_lock"} - %in3_mem_prod_lock = aie.lock(%tile_1_1, 0) {init = 0 : i32, sym_name = "in3_mem_prod_lock"} - %in3_mem_cons_lock = aie.lock(%tile_1_1, 1) {init = 1 : i32, sym_name = "in3_mem_cons_lock"} - %in4_mem_prod_lock = aie.lock(%tile_2_1, 0) {init = 0 : i32, sym_name = "in4_mem_prod_lock"} - %in4_mem_cons_lock = aie.lock(%tile_2_1, 1) {init = 2 : i32, sym_name = "in4_mem_cons_lock"} - %in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_0"} : 
memref<16xi32> - %in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_1"} : memref<16xi32> - %in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in1_cons_prod_lock"} - %in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in1_cons_cons_lock"} - aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) - aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) - %core_0_2 = aie.core(%tile_0_2) { - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c9223372036854775806 = arith.constant 9223372036854775806 : index - %c2 = arith.constant 2 : index - scf.for %arg0 = %c0 to %c9223372036854775806 step %c2 { - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : 
index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_1[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg1] : memref<16xi32> - %2 = arith.muli %arg0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg1] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg0, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg1] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg1] : memref<16xi32> - %6 = arith.addi %arg1, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.end - } - aie.shim_dma_allocation @in1(MM2S, 0, 0) - aiex.runtime_sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_wait {symbol = @out} - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_0 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb1 - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6) - ^bb4: // 2 preds: ^bb3, ^bb5 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_0 : memref<256xi32>, 0, 256) - aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb5 - ^bb5: // pred: ^bb4 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_1 : memref<256xi32>, 0, 256) - aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb4 - ^bb6: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9) - ^bb7: // 2 preds: ^bb6, ^bb8 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_0 : 
memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb8 - ^bb8: // pred: ^bb7 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb7 - ^bb9: // pred: ^bb6 - aie.end - } - aie.shim_dma_allocation @out(S2MM, 0, 0) - %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb5) - ^bb1: - aie.use_lock(%in2_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_0 : memref<64xi32>, 0, 64) - aie.use_lock(%in2_mem_prod_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: - aie.use_lock(%in3_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_1 : memref<64xi32>, 0, 64) - aie.use_lock(%in3_mem_prod_lock, Release, 1) - aie.next_bd ^bb3 - ^bb3: - aie.use_lock(%in4_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_2 : memref<64xi32>, 0, 64) - aie.use_lock(%in4_mem_prod_lock, Release, 1) - aie.next_bd ^bb4 - ^bb4: - aie.use_lock(%in4_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_3 : memref<64xi32>, 0, 64) - aie.use_lock(%in4_mem_prod_lock, Release, 1) - aie.next_bd ^bb5 - ^bb5: - aie.end - } - } -} - diff --git a/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie2.py b/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie2.py new file mode 100644 index 0000000000..4e5bd39bbc --- /dev/null +++ b/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie2.py @@ -0,0 +1,206 @@ +# vector_vector_add/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: ryzen_ai +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +# CHECK: PASS! 
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.dialects import memref
+from aie.extras import types as T  # assumed import path for the T.* type helpers used below
+
+
+def my_vector_add():
+    N = 256
+    n = 16
+    N_div_n = N // n
+
+    @device(AIEDevice.npu1_4col)
+    def device_body():
+        # AIE Core Function declarations
+        tensor_ty_c = np.ndarray[(N,), np.dtype[np.int32]]
+        tensor_ty = np.ndarray[(N // 4,), np.dtype[np.int32]]
+        tensor_ty_s = np.ndarray[(n,), np.dtype[np.int32]]
+
+        memref.global_("out", T.memref(16, T.i32()), sym_visibility="public")
+        memref.global_("in1", T.memref(16, T.i32()), sym_visibility="public")
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        MemTile2 = tile(1, 1)
+        MemTile3 = tile(2, 1)
+        ComputeTile2 = tile(0, 2)
+
+        # MemTile elements
+        in2_mem_prod_lock = lock(MemTile, lock_id=0, init=0)
+        in2_mem_cons_lock = lock(MemTile, lock_id=1, init=1)
+        in2_mem_buff_0 = buffer(
+            tile=MemTile,
+            datatype=tensor_ty,
+            name="in2_mem_buff_0",
+            initial_value=np.arange(N // 4, dtype=np.int32),
+        )
+
+        # MemTile2 elements
+        in3_mem_prod_lock = lock(MemTile2, lock_id=0, init=0)
+        in3_mem_cons_lock = lock(MemTile2, lock_id=1, init=1)
+        in3_mem_buff_0 = buffer(
+            tile=MemTile2,
+            datatype=tensor_ty,
+            name="in3_mem_buff_0",
+            initial_value=np.arange(N // 4, (N // 4) * 2, dtype=np.int32),
+        )
+
+        # MemTile3 elements
+        in4_mem_prod_lock = lock(MemTile3, lock_id=0, init=0)
+        in4_mem_cons_lock = lock(MemTile3, lock_id=1, init=2)
+        in4_mem_buff_0 = buffer(
+            tile=MemTile3,
+            datatype=tensor_ty,
+            name="in4_mem_buff_0",
+            initial_value=np.arange((N // 4) * 2, (N // 4) * 3, dtype=np.int32),
+        )
+        in4_mem_buff_1 = buffer(
+            tile=MemTile3,
+            datatype=tensor_ty,
+            name="in4_mem_buff_1",
+            initial_value=np.arange((N // 4) * 3, N, dtype=np.int32),
+        )
+
+        # ComputeTile2 elements
+        # Input from ShimTile
+        in1_cons_prod_lock = lock(ComputeTile2, lock_id=0, init=1)
+        in1_cons_cons_lock = lock(ComputeTile2, lock_id=1, init=0)
+        in1_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="in1_cons_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+        # Input from MemTile
+        in2_mem_cons_prod_lock = lock(ComputeTile2, lock_id=2, init=1)
+        in2_mem_cons_cons_lock = lock(ComputeTile2, lock_id=3, init=0)
+        in2_mem_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_c,
+            name="in2_mem_cons_buff_0",
+            initial_value=np.arange(N, dtype=np.int32),
+        )
+        # Output to ShimTile
+        out_prod_lock = lock(ComputeTile2, lock_id=4, init=1)
+        out_cons_lock = lock(ComputeTile2, lock_id=5, init=0)
+        out_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="out_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+
+        flow(ShimTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 0)
+        flow(MemTile2, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 1)
+        flow(ComputeTile2, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 0)
+
+        # AIE-array data movement
+        shim_dma_allocation("in1", DMAChannelDir.MM2S, 0, 0)
+        shim_dma_allocation("out", DMAChannelDir.S2MM, 0, 0)
+
+        @memtile_dma(MemTile2)
+        def m(block):
+            s0 = dma_start(DMAChannelDir.MM2S, 0, dest=block[1], chain=block[5])
+            with block[1]:
+                use_lock(in2_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in2_mem_buff_0)
+                use_lock(in2_mem_prod_lock, LockAction.Release)
+                next_bd(block[2])
+            with block[2]:
+                use_lock(in3_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in3_mem_buff_0)
+                use_lock(in3_mem_prod_lock,
LockAction.Release) + next_bd(block[3]) + with block[3]: + use_lock(in4_mem_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(in4_mem_buff_0) + use_lock(in4_mem_prod_lock, LockAction.Release) + next_bd(block[4]) + with block[4]: + use_lock(in4_mem_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(in4_mem_buff_1) + use_lock(in4_mem_prod_lock, LockAction.Release) + next_bd(block[1]) + with block[5]: + EndOp() + + @mem(ComputeTile2) + def m(block): + s0 = dma_start(DMAChannelDir.S2MM, 0, dest=block[1], chain=block[2]) + with block[1]: + use_lock(in1_cons_prod_lock, LockAction.AcquireGreaterEqual) + dma_bd(in1_cons_buff_0) + use_lock(in1_cons_cons_lock, LockAction.Release) + next_bd(block[1]) + with block[2]: + s1 = dma_start(DMAChannelDir.S2MM, 1, dest=block[3], chain=block[4]) + with block[3]: + use_lock(in2_mem_cons_prod_lock, LockAction.AcquireGreaterEqual) + dma_bd(in2_mem_cons_buff_0) + use_lock(in2_mem_cons_cons_lock, LockAction.Release) + next_bd(block[3]) + with block[4]: + s2 = dma_start(DMAChannelDir.MM2S, 0, dest=block[5], chain=block[6]) + with block[5]: + use_lock(out_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(out_buff_0) + use_lock(out_prod_lock, LockAction.Release) + next_bd(block[5]) + with block[6]: + EndOp() + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in range_(sys.maxsize): + # Number of sub-vector "tile" iterations + use_lock(in2_mem_cons_cons_lock, LockAction.AcquireGreaterEqual) + for j in range_(N_div_n): + use_lock(in1_cons_cons_lock, LockAction.AcquireGreaterEqual) + use_lock(out_prod_lock, LockAction.AcquireGreaterEqual) + for i in range_(n): + out_buff_0[i] = ( + in2_mem_cons_buff_0[j * N_div_n + i] + in1_cons_buff_0[i] + ) + use_lock(in1_cons_prod_lock, LockAction.Release) + use_lock(out_cons_lock, LockAction.Release) + use_lock(in2_mem_cons_prod_lock, LockAction.Release) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty_c, tensor_ty_c, tensor_ty_c) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_wait("out") + + +with mlir_mod_ctx() as ctx: + my_vector_add() + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/test/npu-xrt/adjacent_memtile_access/three_memtiles/run.lit b/test/npu-xrt/adjacent_memtile_access/three_memtiles/run.lit deleted file mode 100644 index 81d333bfd4..0000000000 --- a/test/npu-xrt/adjacent_memtile_access/three_memtiles/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir -// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! 
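Aside on the capacity half of the check in getDMAChannelIndex: shim tiles are capped at two channels per direction, while core and mem tiles ask the target model how many DMA switchbox connections the tile has, counting source connections for MM2S and destination connections for S2MM (the real calls also pass WireBundle::DMA). A hedged C++ sketch of that dispatch; ITargetModel here is an illustrative stand-in, not the real AIETargetModel interface:

// Illustrative stand-in for the target-model queries the pass relies on.
struct ITargetModel {
  virtual int getNumSourceSwitchboxConnections(int col, int row) const = 0;
  virtual int getNumDestSwitchboxConnections(int col, int row) const = 0;
  virtual ~ITargetModel() = default;
};

enum class DMAChannelDir { MM2S, S2MM };

int maxDMAChannels(const ITargetModel &tm, bool isShimTile, int col, int row,
                   DMAChannelDir dir) {
  if (isShimTile)
    return 2; // shim DMAs: two channels each way, hardcoded as in the pass
  return dir == DMAChannelDir::MM2S
             ? tm.getNumSourceSwitchboxConnections(col, row) // tile outputs
             : tm.getNumDestSwitchboxConnections(col, row);  // tile inputs
}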
- diff --git a/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie.mlir b/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie.mlir deleted file mode 100644 index fcffc71e9a..0000000000 --- a/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie.mlir +++ /dev/null @@ -1,225 +0,0 @@ -//===- aie.mlir ------------------------------------------------*- MLIR -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -// -//===----------------------------------------------------------------------===// - -module { - aie.device(npu1_4col) { - memref.global "public" @out_cons : memref<16xi32> - memref.global "public" @out : memref<16xi32> - memref.global "public" @in2_mem_cons : memref<256xi32> - memref.global "public" @in2_mem : memref<256xi32> - memref.global "public" @in1_cons : memref<16xi32> - memref.global "public" @in1 : memref<16xi32> - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_1_1 = aie.tile(1, 1) - %tile_0_2 = aie.tile(0, 2) - %out_buff_0 = aie.buffer(%tile_0_2) {sym_name = "out_buff_0"} : memref<16xi32> - %out_buff_1 = aie.buffer(%tile_0_2) {sym_name = "out_buff_1"} : memref<16xi32> - %out_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "out_prod_lock"} - %out_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "out_cons_lock"} - %in2_mem_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_0"} : memref<256xi32> - %in2_mem_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_1"} : memref<256xi32> - %in2_mem_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "in2_mem_cons_prod_lock"} - %in2_mem_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "in2_mem_cons_cons_lock"} - %in2_mem_buff_0 = aie.buffer(%tile_0_1) {sym_name = "in2_mem_buff_0"} : memref<128xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]> - %in2_mem_buff_1 = aie.buffer(%tile_1_1) {sym_name = "in2_mem_buff_1"} : memref<128xi32> = dense<[128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]> - %in2_mem_prod_lock = aie.lock(%tile_0_1, 0) {init = 0 : i32, sym_name = "in2_mem_prod_lock"} - %in2_mem_cons_lock = aie.lock(%tile_0_1, 1) {init = 1 : i32, sym_name = "in2_mem_cons_lock"} - %in3_mem_prod_lock = aie.lock(%tile_1_1, 0) {init = 0 
: i32, sym_name = "in3_mem_prod_lock"} - %in3_mem_cons_lock = aie.lock(%tile_1_1, 1) {init = 1 : i32, sym_name = "in3_mem_cons_lock"} - %in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_0"} : memref<16xi32> - %in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_1"} : memref<16xi32> - %in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in1_cons_prod_lock"} - %in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in1_cons_cons_lock"} - aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 1) - aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) - %core_0_2 = aie.core(%tile_0_2) { - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c9223372036854775806 = arith.constant 9223372036854775806 : index - %c2 = arith.constant 2 : index - scf.for %arg0 = %c0 to %c9223372036854775806 step %c2 { - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : 
memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_1[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg1] : memref<16xi32> - %2 = arith.muli %arg0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg1] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg0, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg1] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg1] : memref<16xi32> - %6 = arith.addi %arg1, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.end - } - aie.shim_dma_allocation @in1(MM2S, 0, 0) - aiex.runtime_sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_wait {symbol = @out} - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_0 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb1 - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6) - ^bb4: // 2 preds: ^bb3, ^bb5 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_0 : memref<256xi32>, 0, 256) - aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb5 - ^bb5: // pred: ^bb4 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_1 : memref<256xi32>, 0, 256) - 
aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb4 - ^bb6: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9) - ^bb7: // 2 preds: ^bb6, ^bb8 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_0 : memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb8 - ^bb8: // pred: ^bb7 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb7 - ^bb9: // pred: ^bb6 - aie.end - } - aie.shim_dma_allocation @out(S2MM, 0, 0) - %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: - aie.use_lock(%in2_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_0 : memref<128xi32>, 0, 128) - aie.use_lock(%in2_mem_prod_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: - aie.use_lock(%in3_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_1 : memref<128xi32>, 0, 128) - aie.use_lock(%in3_mem_prod_lock, Release, 1) - aie.next_bd ^bb1 - ^bb3: - aie.end - } - } -} - diff --git a/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie2.py b/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie2.py new file mode 100644 index 0000000000..3092d94e37 --- /dev/null +++ b/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie2.py @@ -0,0 +1,179 @@ +# vector_vector_add/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: ryzen_ai +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +# CHECK: PASS! 
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.dialects import memref
+from aie.extras import types as T  # assumed import path for the T.* type helpers used below
+
+
+def my_vector_add():
+    N = 256
+    n = 16
+    N_div_n = N // n
+
+    @device(AIEDevice.npu1_4col)
+    def device_body():
+        # AIE Core Function declarations
+        tensor_ty_c = np.ndarray[(N,), np.dtype[np.int32]]
+        tensor_ty = np.ndarray[(N // 2,), np.dtype[np.int32]]
+        tensor_ty_s = np.ndarray[(n,), np.dtype[np.int32]]
+
+        memref.global_("out", T.memref(16, T.i32()), sym_visibility="public")
+        memref.global_("in1", T.memref(16, T.i32()), sym_visibility="public")
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        MemTile2 = tile(1, 1)
+        ComputeTile2 = tile(0, 2)
+
+        # MemTile elements
+        in2_mem_prod_lock = lock(MemTile, lock_id=0, init=0)
+        in2_mem_cons_lock = lock(MemTile, lock_id=1, init=1)
+        in2_mem_buff_0 = buffer(
+            tile=MemTile,
+            datatype=tensor_ty,
+            name="in2_mem_buff_0",
+            initial_value=np.arange(N // 2, dtype=np.int32),
+        )
+
+        # MemTile2 elements
+        in3_mem_prod_lock = lock(MemTile2, lock_id=0, init=0)
+        in3_mem_cons_lock = lock(MemTile2, lock_id=1, init=1)
+        in3_mem_buff_0 = buffer(
+            tile=MemTile2,
+            datatype=tensor_ty,
+            name="in3_mem_buff_0",
+            initial_value=np.arange(N // 2, N, dtype=np.int32),
+        )
+
+        # ComputeTile2 elements
+        # Input from ShimTile
+        in1_cons_prod_lock = lock(ComputeTile2, lock_id=0, init=1)
+        in1_cons_cons_lock = lock(ComputeTile2, lock_id=1, init=0)
+        in1_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="in1_cons_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+        # Input from MemTile
+        in2_mem_cons_prod_lock = lock(ComputeTile2, lock_id=2, init=1)
+        in2_mem_cons_cons_lock = lock(ComputeTile2, lock_id=3, init=0)
+        in2_mem_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_c,
+            name="in2_mem_cons_buff_0",
+            initial_value=np.arange(N, dtype=np.int32),
+        )
+        # Output to ShimTile
+        out_prod_lock = lock(ComputeTile2, lock_id=4, init=1)
+        out_cons_lock = lock(ComputeTile2, lock_id=5, init=0)
+        out_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="out_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+
+        flow(ShimTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 0)
+        flow(MemTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 1)
+        flow(ComputeTile2, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 0)
+
+        # AIE-array data movement
+        shim_dma_allocation("in1", DMAChannelDir.MM2S, 0, 0)
+        shim_dma_allocation("out", DMAChannelDir.S2MM, 0, 0)
+
+        @memtile_dma(MemTile)
+        def m(block):
+            s0 = dma_start(DMAChannelDir.MM2S, 0, dest=block[1], chain=block[3])
+            with block[1]:
+                use_lock(in2_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in2_mem_buff_0)
+                use_lock(in2_mem_prod_lock, LockAction.Release)
+                next_bd(block[2])
+            with block[2]:
+                use_lock(in3_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in3_mem_buff_0)
+                use_lock(in3_mem_prod_lock, LockAction.Release)
+                next_bd(block[1])
+            with block[3]:
+                EndOp()
+
+        @mem(ComputeTile2)
+        def m(block):
+            s0 = dma_start(DMAChannelDir.S2MM, 0, dest=block[1], chain=block[2])
+            with block[1]:
+                use_lock(in1_cons_prod_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in1_cons_buff_0)
+                use_lock(in1_cons_cons_lock, LockAction.Release)
+                next_bd(block[1])
+            with block[2]:
+                s1 = dma_start(DMAChannelDir.S2MM, 1, dest=block[3], chain=block[4])
+            with block[3]:
+                use_lock(in2_mem_cons_prod_lock,
LockAction.AcquireGreaterEqual) + dma_bd(in2_mem_cons_buff_0) + use_lock(in2_mem_cons_cons_lock, LockAction.Release) + next_bd(block[3]) + with block[4]: + s2 = dma_start(DMAChannelDir.MM2S, 0, dest=block[5], chain=block[6]) + with block[5]: + use_lock(out_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(out_buff_0) + use_lock(out_prod_lock, LockAction.Release) + next_bd(block[5]) + with block[6]: + EndOp() + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in range_(sys.maxsize): + # Number of sub-vector "tile" iterations + use_lock(in2_mem_cons_cons_lock, LockAction.AcquireGreaterEqual) + for j in range_(N_div_n): + use_lock(in1_cons_cons_lock, LockAction.AcquireGreaterEqual) + use_lock(out_prod_lock, LockAction.AcquireGreaterEqual) + for i in range_(n): + out_buff_0[i] = ( + in2_mem_cons_buff_0[j * N_div_n + i] + in1_cons_buff_0[i] + ) + use_lock(in1_cons_prod_lock, LockAction.Release) + use_lock(out_cons_lock, LockAction.Release) + use_lock(in2_mem_cons_prod_lock, LockAction.Release) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty_c, tensor_ty_c, tensor_ty_c) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_wait("out") + + +with mlir_mod_ctx() as ctx: + my_vector_add() + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/test/npu-xrt/adjacent_memtile_access/two_memtiles/run.lit b/test/npu-xrt/adjacent_memtile_access/two_memtiles/run.lit deleted file mode 100644 index 81d333bfd4..0000000000 --- a/test/npu-xrt/adjacent_memtile_access/two_memtiles/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir -// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! - diff --git a/test/objectFifo-stateful-transform/tileDMA_test.mlir b/test/objectFifo-stateful-transform/tileDMA_test.mlir index bea2793512..931972fd24 100644 --- a/test/objectFifo-stateful-transform/tileDMA_test.mlir +++ b/test/objectFifo-stateful-transform/tileDMA_test.mlir @@ -1,4 +1,4 @@ -//===- tileDMA_test.mlir --------------------------*- MLIR -*-===// +//===- tileDMA_test.mlir ---------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -30,7 +30,7 @@
 // CHECK: %[[VAL_13:.*]] = aie.lock(%[[VAL_0]], 1)
 // CHECK: %[[VAL_14:.*]] = aie.buffer(%[[VAL_0]]) : memref<16xi32>
 // CHECK: %[[VAL_15:.*]] = aie.lock(%[[VAL_0]], 2)
-// CHECK: aie.flow(%[[VAL_0]], DMA : 1, %[[VAL_1]], DMA : 0)
+// CHECK: aie.flow(%[[VAL_0]], DMA : 0, %[[VAL_1]], DMA : 0)
 // CHECK: func.func @some_work(%[[VAL_16:.*]]: memref<16xi32>) {
 // CHECK: return
 // CHECK: }
@@ -50,7 +50,7 @@
 // CHECK: aie.end
 // CHECK: }
 // CHECK: %[[VAL_23:.*]] = aie.mem(%[[VAL_0]]) {
-// CHECK: %[[VAL_24:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+// CHECK: %[[VAL_24:.*]] = aie.dma_start(MM2S, 1, ^bb1, ^bb3)
 // CHECK: ^bb1: // 2 preds: ^bb0, ^bb2
 // CHECK: aie.use_lock(%[[VAL_11]], Acquire, 1)
 // CHECK: aie.dma_bd(%[[VAL_10]] : memref<16xi32>, 0, 16)
@@ -69,7 +69,7 @@
 // CHECK: aie.use_lock(%[[VAL_15]], Release, 1)
 // CHECK: aie.next_bd ^bb4
 // CHECK: ^bb5: // pred: ^bb3
-// CHECK: %[[VAL_26:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8)
+// CHECK: %[[VAL_26:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb8)
 // CHECK: ^bb6: // 2 preds: ^bb5, ^bb7
 // CHECK: aie.use_lock(%[[VAL_8]], Acquire, 1)
 // CHECK: aie.dma_bd(%[[VAL_6]] : memref<16xi32>, 0, 16)
@@ -134,7 +134,7 @@ module @tileDMA_channels {
     }
     %mem12 = aie.mem(%tile12) {
-      %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+      %dma1 = aie.dma_start(MM2S, 1, ^bb1, ^bb3)
    ^bb1:
      aie.use_lock(%lock0, Acquire, 1)
      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad.mlir
new file mode 100644
index 0000000000..af95ba4abb
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad.mlir
@@ -0,0 +1,52 @@
+//===- tileDMA_test_bad.mlir -----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of output DMA channels exceeded!
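In the test below, the user-written aie.mem on %tile12 claims both MM2S channels (0 and 1) of the xcvc1902 core tile, so the objectfifo lowering finds no free output channel. In terms of the first sketch above, reusing its ChannelKey and getDMAChannelIndex (tile id 12 is just an illustrative encoding of tile (1, 2)):

// Both MM2S channels of the core tile are taken before the pass runs.
std::map<ChannelKey, int> used = {{{12, DMAChannelDir::MM2S, 0}, 1},
                                  {{12, DMAChannelDir::MM2S, 1}, 1}};
int idx = getDMAChannelIndex(used, /*tile=*/12, DMAChannelDir::MM2S,
                             /*maxChannelNum=*/2);
// idx == -1, so the pass emits "number of output DMA channels exceeded!"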
+
+module @tileDMA_channels {
+  aie.device(xcvc1902) {
+    %tile12 = aie.tile(1, 2)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile12) : memref<16xi32>
+    %lock0 = aie.lock(%tile12, 0)
+    %buff1 = aie.buffer(%tile12) : memref<16xi32>
+    %lock1 = aie.lock(%tile12, 1)
+    %buff2 = aie.buffer(%tile12) : memref<16xi32>
+    %lock2 = aie.lock(%tile12, 2)
+
+    aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem12 = aie.mem(%tile12) {
+      %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:
+      aie.use_lock(%lock0, Acquire, 1)
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock0, Release, 0)
+      aie.next_bd ^bb2
+    ^bb2:
+      aie.use_lock(%lock1, Acquire, 1)
+      aie.dma_bd(%buff1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock1, Release, 0)
+      aie.next_bd ^bb1
+    ^bb3:
+      %dma2 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:
+      aie.use_lock(%lock2, Acquire, 0)
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock2, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:
+      aie.end
+    }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad2.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad2.mlir
new file mode 100644
index 0000000000..0c7eca1763
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad2.mlir
@@ -0,0 +1,52 @@
+//===- tileDMA_test_bad2.mlir ----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of input DMA channels exceeded!
+
+module @tileDMA_channels {
+  aie.device(xcvc1902) {
+    %tile12 = aie.tile(1, 2)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile12) : memref<16xi32>
+    %lock0 = aie.lock(%tile12, 0)
+    %buff1 = aie.buffer(%tile12) : memref<16xi32>
+    %lock1 = aie.lock(%tile12, 1)
+    %buff2 = aie.buffer(%tile12) : memref<16xi32>
+    %lock2 = aie.lock(%tile12, 2)
+
+    aie.objectfifo @objfifo (%tile33, {%tile12}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem12 = aie.mem(%tile12) {
+      %dma1 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:
+      aie.use_lock(%lock0, Acquire, 1)
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock0, Release, 0)
+      aie.next_bd ^bb2
+    ^bb2:
+      aie.use_lock(%lock1, Acquire, 1)
+      aie.dma_bd(%buff1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock1, Release, 0)
+      aie.next_bd ^bb1
+    ^bb3:
+      %dma2 = aie.dma_start(S2MM, 1, ^bb4, ^bb5)
+    ^bb4:
+      aie.use_lock(%lock2, Acquire, 0)
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock2, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:
+      aie.end
+    }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad3.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad3.mlir
new file mode 100644
index 0000000000..b84cc3e243
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad3.mlir
@@ -0,0 +1,63 @@
+//===- tileDMA_test_bad3.mlir ----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of input DMA channels exceeded!
+
+module @tileDMA_channels {
+  aie.device(xcve2302) {
+    %tile11 = aie.tile(1, 1)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile11) : memref<16xi32>
+    %buff1 = aie.buffer(%tile11) : memref<16xi32>
+    %buff2 = aie.buffer(%tile11) : memref<16xi32>
+    %buff3 = aie.buffer(%tile11) : memref<16xi32>
+    %buff4 = aie.buffer(%tile11) : memref<16xi32>
+    %buff5 = aie.buffer(%tile11) : memref<16xi32>
+
+    aie.objectfifo @objfifo (%tile33, {%tile11}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem11 = aie.memtile_dma(%tile11) {
+      %dma1 = aie.dma_start(S2MM, 0, ^bb1, ^bb2)
+    ^bb1:
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb1
+    ^bb2:
+      %dma2 = aie.dma_start(S2MM, 1, ^bb3, ^bb4)
+    ^bb3:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb3
+    ^bb4:
+      %dma3 = aie.dma_start(S2MM, 2, ^bb5, ^bb6)
+    ^bb5:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb5
+    ^bb6:
+      %dma4 = aie.dma_start(S2MM, 3, ^bb7, ^bb8)
+    ^bb7:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb7
+    ^bb8:
+      %dma5 = aie.dma_start(S2MM, 4, ^bb9, ^bb10)
+    ^bb9:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb9
+    ^bb10:
+      %dma6 = aie.dma_start(S2MM, 5, ^bb11, ^bb12)
+    ^bb11:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb11
+    ^bb12:
+      aie.end
+    }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad4.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad4.mlir
new file mode 100644
index 0000000000..65bb4bfb60
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad4.mlir
@@ -0,0 +1,63 @@
+//===- tileDMA_test_bad4.mlir ----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of output DMA channels exceeded!
+
+module @tileDMA_channels {
+  aie.device(xcve2302) {
+    %tile11 = aie.tile(1, 1)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile11) : memref<16xi32>
+    %buff1 = aie.buffer(%tile11) : memref<16xi32>
+    %buff2 = aie.buffer(%tile11) : memref<16xi32>
+    %buff3 = aie.buffer(%tile11) : memref<16xi32>
+    %buff4 = aie.buffer(%tile11) : memref<16xi32>
+    %buff5 = aie.buffer(%tile11) : memref<16xi32>
+
+    aie.objectfifo @objfifo (%tile11, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem11 = aie.memtile_dma(%tile11) {
+      %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
+    ^bb1:
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb1
+    ^bb2:
+      %dma2 = aie.dma_start(MM2S, 1, ^bb3, ^bb4)
+    ^bb3:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb3
+    ^bb4:
+      %dma3 = aie.dma_start(MM2S, 2, ^bb5, ^bb6)
+    ^bb5:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb5
+    ^bb6:
+      %dma4 = aie.dma_start(MM2S, 3, ^bb7, ^bb8)
+    ^bb7:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb7
+    ^bb8:
+      %dma5 = aie.dma_start(MM2S, 4, ^bb9, ^bb10)
+    ^bb9:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb9
+    ^bb10:
+      %dma6 = aie.dma_start(MM2S, 5, ^bb11, ^bb12)
+    ^bb11:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb11
+    ^bb12:
+      aie.end
+    }
+  }
+}
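The mem-tile variants bad3 and bad4 exercise the same exhaustion on xcve2302: the user-written aie.memtile_dma occupies DMA channels 0 through 5 of tile (1, 1), i.e. all six connections the mem-tile switchbox exposes per direction, so the objectfifo's request for one more channel must fail. A runnable toy combining the earlier sketches (the capacity of 6 is what these two tests imply, not a value taken from the target-model code):

#include <iostream>
#include <map>
#include <tuple>

enum class DMAChannelDir { MM2S, S2MM };
using ChannelKey = std::tuple<int, DMAChannelDir, int>;

int getDMAChannelIndex(std::map<ChannelKey, int> &channelsPerTile, int tile,
                       DMAChannelDir dir, int maxChannelNum) {
  for (int i = 0; i < maxChannelNum; i++)
    if (channelsPerTile[{tile, dir, i}] == 0) {
      channelsPerTile[{tile, dir, i}] = 1;
      return i;
    }
  return -1;
}

int main() {
  std::map<ChannelKey, int> channels;
  const int kMemTileChannels = 6; // per direction, as tileDMA_test_bad3/4 imply
  // S2MM 0..5 are claimed by the user-written aie.memtile_dma, as in bad3:
  for (int i = 0; i < kMemTileChannels; i++)
    channels[{11, DMAChannelDir::S2MM, i}] = 1;
  int idx = getDMAChannelIndex(channels, 11, DMAChannelDir::S2MM,
                               kMemTileChannels);
  std::cout << "next S2MM channel: " << idx << "\n"; // prints -1
}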