diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 2150012455..8427335ae5 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -1677,7 +1677,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] OptionalAttr:$via_shared_mem, // repeat_count==1 means "do it once" OptionalAttr]>>:$repeat_count, - InitValuesArrayAttr:$initValues + InitValuesArrayAttr:$initValues, + OptionalAttr:$padDimensions ); let assemblyFormat = [{ @@ -1717,7 +1718,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] OpBuilder<(ins "mlir::StringAttr":$sym_name, "mlir::Value":$producerTile, "mlir::ValueRange":$consumerTiles, "mlir::Attribute":$elemNumber, "mlir::Type":$elem_type, CArg<"llvm::ArrayRef", "{}">:$dimensionsToStream, - CArg<"llvm::ArrayRef", "{}">:$dimensionsFromStreamPerConsumer), [{ + CArg<"llvm::ArrayRef", "{}">:$dimensionsFromStreamPerConsumer, + CArg<"llvm::ArrayRef", "{}">:$padDimensions), [{ odsState.addOperands(producerTile); odsState.addOperands(consumerTiles); odsState.addAttribute(getSymNameAttrName(odsState.name), sym_name); diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index ce04f131e0..2e1b2b1d32 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -570,7 +570,13 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ OptionalAttr:$packet, FlatSymbolRefAttr:$metadata, I64Attr:$id, - DefaultValuedOptionalAttr:$issue_token + DefaultValuedOptionalAttr:$issue_token, + DefaultValuedOptionalAttr:$d0_zero_before, + DefaultValuedOptionalAttr:$d1_zero_before, + DefaultValuedOptionalAttr:$d2_zero_before, + DefaultValuedOptionalAttr:$d0_zero_after, + DefaultValuedOptionalAttr:$d1_zero_after, + DefaultValuedOptionalAttr:$d2_zero_after ); let assemblyFormat = [{ @@ -828,6 +834,7 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> { I32Attr:$d0_stride, 
I32Attr:$d1_size, I32Attr:$d1_stride, + I32Attr:$d2_size, I32Attr:$d2_stride, I32Attr:$iteration_current, I32Attr:$iteration_size, @@ -840,7 +847,13 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> { I32Attr:$lock_rel_id, I32Attr:$lock_acq_enable, I32Attr:$lock_acq_val, - I32Attr:$lock_acq_id + I32Attr:$lock_acq_id, + I32Attr:$d0_zero_before, + I32Attr:$d1_zero_before, + I32Attr:$d2_zero_before, + I32Attr:$d0_zero_after, + I32Attr:$d1_zero_after, + I32Attr:$d2_zero_after ); let results = (outs ); let assemblyFormat = [{ attr-dict }]; diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index 01b4fc9f4f..ff4c2a929e 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -1941,11 +1941,11 @@ LogicalResult DMABDOp::verify() { if (!dims.has_value()) return emitOpError() << "Padding requires n-d data layouts expressed as" << " wrap(s) and stride(s)."; + if (!targetModel.isMemTile(parentTileId.col, parentTileId.row)) + return emitOpError() << "Padding is only supported by memtile dma bds."; if (dims->size() != paddims->size()) return emitOpError() << "Mismatch number of dimensions between padding(s)" << " and wrap(s) and stride(s)."; - if (!targetModel.isMemTile(parentTileId.col, parentTileId.row)) - return emitOpError() << "Padding is only supported by memtile dma bds."; int actuallen = 1; for (unsigned i = 0; i < paddims->size(); i++) { auto dim = (*dims)[i]; diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index f520a80d6e..82691ccf29 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -513,14 +513,19 @@ struct AIEObjectFifoStatefulTransformPass void createBd(OpBuilder &builder, LockOp acqLock, int acqMode, LockAction acqLockAction, LockOp relLock, int relMode, MyOp buff, int offset, int len, Block *succ, - 
BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) { if (acqLock) builder.create(builder.getUnknownLoc(), acqLock, acqLockAction, acqMode); - if (!dims.getValue().empty()) + + if (!dims.getValue().empty() && padDimensions) { + builder.create(builder.getUnknownLoc(), buff, offset, len, dims, + padDimensions); + } else if (!dims.getValue().empty()) { builder.create(builder.getUnknownLoc(), buff, offset, len, dims); - else + } else { builder.create(builder.getUnknownLoc(), buff, offset, len); + } if (acqLock) builder.create(builder.getUnknownLoc(), relLock, LockAction::Release, relMode); @@ -534,7 +539,8 @@ struct AIEObjectFifoStatefulTransformPass void createBdBlock(OpBuilder &builder, ObjectFifoCreateOp op, int lockMode, int acqNum, int relNum, MyOp buff, int offset, int len, DMAChannelDir channelDir, size_t blockIndex, Block *succ, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, + BDPadLayoutArrayAttr padDimensions) { LockOp acqLock; LockOp relLock; int acqMode = 1; @@ -559,20 +565,23 @@ struct AIEObjectFifoStatefulTransformPass } } createBd(builder, acqLock, acqMode, acqLockAction, relLock, relMode, buff, - offset, len, succ, dims); + offset, len, succ, dims, padDimensions); } /// Function that either calls createAIETileDMA(), createShimDMA() or /// createMemTileDMA() based on op tile row value. 
void createDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op, DMAChannelDir channelDir, int channelIndex, int lockMode, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr pad_dims) { if (op.getProducerTileOp().isShimTile()) { createShimDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); } else if (op.getProducerTileOp().isMemTile()) { + BDPadLayoutArrayAttr padDims = nullptr; + if (channelDir == DMAChannelDir::MM2S && pad_dims) + padDims = pad_dims; createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode, - dims); + dims, padDims); } else { createAIETileDMA(device, builder, op, channelDir, channelIndex, lockMode, dims); @@ -669,7 +678,7 @@ struct AIEObjectFifoStatefulTransformPass builder.setInsertionPointToStart(curr); createBdBlock(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], /*offset*/ 0, - len, channelDir, blockIndex, succ, dims); + len, channelDir, blockIndex, succ, dims, nullptr); curr = succ; blockIndex++; } @@ -745,7 +754,7 @@ struct AIEObjectFifoStatefulTransformPass createBdBlock(builder, op, lockMode, acqNum, relNum, externalBuffersPerFifo[op][blockIndex], /*offset*/ 0, len, channelDir, blockIndex, - succ, dims); + succ, dims, nullptr); curr = succ; blockIndex++; } @@ -756,7 +765,8 @@ struct AIEObjectFifoStatefulTransformPass void createMemTileDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op, DMAChannelDir channelDir, int channelIndex, int lockMode, - BDDimLayoutArrayAttr dims) { + BDDimLayoutArrayAttr dims, + BDPadLayoutArrayAttr padDimensions) { size_t numBlocks = op.size(); if (numBlocks == 0) return; @@ -898,7 +908,8 @@ struct AIEObjectFifoStatefulTransformPass offset = extraOffset; createBdBlock(builder, target, lockMode, acqNum, relNum, buffersPerFifo[target][blockIndex], offset, - lenOut, channelDir, blockIndex, succ, dims); + lenOut, channelDir, blockIndex, succ, dims, + padDimensions); curr = succ; blockIndex++; } @@ 
-1361,7 +1372,6 @@ struct AIEObjectFifoStatefulTransformPass auto consumerWireType = WireBundle::DMA; std::set objectFifoTiles; // track cores to check for loops during unrolling - //===------------------------------------------------------------------===// // Split objectFifos into a consumer end and producer end if needed //===------------------------------------------------------------------===// @@ -1511,7 +1521,8 @@ struct AIEObjectFifoStatefulTransformPass DMAChannel producerChan = dmaAnalysis.getMasterDMAChannel(producer.getProducerTile()); createDMA(device, builder, producer, producerChan.direction, - producerChan.channel, 0, producer.getDimensionsToStreamAttr()); + producerChan.channel, 0, producer.getDimensionsToStreamAttr(), + producer.getPadDimensionsAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); @@ -1529,7 +1540,7 @@ struct AIEObjectFifoStatefulTransformPass BDDimLayoutArrayAttr consumerDims = consumer.getDimensionsFromStreamPerConsumer()[0]; createDMA(device, builder, consumer, consumerChan.direction, - consumerChan.channel, 1, consumerDims); + consumerChan.channel, 1, consumerDims, nullptr); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index b7c68cbb18..e36b793662 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -459,6 +459,13 @@ LogicalResult AIEX::NpuWriteBdOp::verify() { return emitOpError("Iteration Size exceeds the [0:63] range."); if (getIterationStride() > 0xFFFFF) return emitOpError("Iteration Stride exceeds the [0:1M-1] range."); + if (targetModel.isShimNOCTile(getColumn(), getRow()) && getD2Size() != 0) + return emitOpError("ShimTile only supports 3 dimensions of sizes."); + if (targetModel.isShimNOCTile(getColumn(), getRow()) && + (getD0ZeroBefore() != 0 || getD0ZeroAfter() != 0 || + getD1ZeroBefore() != 0 || 
getD1ZeroAfter() != 0 || + getD2ZeroBefore() != 0 || getD2ZeroAfter() != 0)) + return emitOpError("ShimTile doesn't support zero padding."); return success(); } diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index f56f1cee3e..c064ad6702 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -129,7 +129,7 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase { SmallVector{}, SmallVector{}, SmallVector{}, ArrayRef(staticOffsets), ArrayRef(staticSizes), ArrayRef(staticStrides), - controllerIdPkt, metadata, 0, true); + controllerIdPkt, metadata, 0, true, 0, 0, 0, 0, 0, 0); auto shimRow = builder.getI32IntegerAttr(0); auto shimCol = builder.getI32IntegerAttr(col); diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 8c889553da..39d9a55806 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -216,7 +216,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { } LogicalResult rewriteSingleBD(OpBuilder &builder, Block &block, - AIE::TileOp &tile) { + AIE::TileOp &tile, + AIE::DMAChannelDir channelDir) { AIE::DMABDOp bd_op = getBdForBlock(block); const auto &target_model = AIE::getTargetModel(bd_op); MemRefType buffer_type = bd_op.getBuffer().getType(); @@ -237,12 +238,23 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { << len << " bytes falls below minimum hardware transfer unit of " << (addr_granularity / 8) << " bytes."; } - // Process strides/wraps std::optional> dims = bd_op.getDimensions(); llvm::SmallVector sizes = llvm::SmallVector(4, 0); llvm::SmallVector strides = llvm::SmallVector(4, 0); + + // Padding + std::optional> padDims = + bd_op.getPadDimensions(); + llvm::SmallVector padBefore = + llvm::SmallVector(4, 0); + llvm::SmallVector padAfter = + llvm::SmallVector(4, 0); + 
std::fill(padBefore.begin(), padBefore.end(), 0); + std::fill(padAfter.begin(), padAfter.end(), 0); + int d2size = 0; + if (dims && dims->size() > 0) { llvm::SmallVector input_sizes = llvm::SmallVector(4, 1); @@ -252,6 +264,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { return bd_op->emitOpError("At most four data layout transformation " "dimensions may be provided."); } + for (size_t i = 0; i < dims->size(); i++) { // Pass down dimensions in reverse order; in the MLIR, this allows // us to specify step sizes/wraps in the same order as we would @@ -260,6 +273,33 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { input_sizes[i] = (*dims)[j].getSize(); input_strides[i] = (*dims)[j].getStride(); } + if (dims->size() > 2) { + d2size = (target_model.isMemTile(tile.getCol(), tile.getRow())) + ? (*dims)[2].getSize() + : 0; + } + if (padDims.has_value()) { + if (!target_model.isMemTile(tile.getCol(), tile.getRow())) + return bd_op->emitOpError() + << "Padding is only supported by memtile dma bds."; + if (padDims->size() > dims->size()) + return bd_op->emitOpError() + << "Mismatch number of dimensions between padding(s)" + << " and wrap(s) and stride(s)."; + if (channelDir == AIE::DMAChannelDir::MM2S) { + for (size_t i = 0; i < padDims->size(); i++) { + int j = padDims->size() - i - 1; + padBefore[i] = (*padDims)[j].getConstPadBefore(); + padAfter[i] = (*padDims)[j].getConstPadAfter(); + } + for (size_t i = padDims->size(); i < dims->size(); i++) { + padBefore[i] = 0; + padAfter[i] = 0; + } + } else + return bd_op->emitOpError() + << "supports padding only for MM2S direction on MemTiles."; + } getHardwareStridesWraps(target_model, buffer_type, input_sizes, input_strides, sizes, strides); if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(), @@ -290,8 +330,16 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { "transfer length, as this is the BD repeat count."; return failure(); } + } else { + if (padDims && 
target_model.isMemTile(tile.getCol(), tile.getRow()) && + channelDir == AIE::DMAChannelDir::MM2S) { + return bd_op->emitOpError() + << "Padding requires n-d data layouts expressed as " + << "wrap(s) and stride(s)."; + } else if (padDims) { + return bd_op->emitOpError() << "Padding is supported only on MemTiles."; + } } - // find next BD ID, if any uint32_t use_next_bd = 0; uint32_t next_bd_id = 0; @@ -306,7 +354,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /* TODO: Strides/Wraps */ /*d0_size=*/sizes[0], /*d0_stride=*/strides[0], /*d1_size=*/sizes[1], /*d1_stride=*/strides[1], - /*d2_stride=*/strides[2], + /*d2_size=*/d2size, /*d2_stride=*/strides[2], /*iteration_current=*/0, /*iteration_size=*/sizes[3], /*iteration_stride=*/strides[3], /* TODO: Next BD */ @@ -316,7 +364,10 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { /*valid_bd=*/1, /* TODO: Locks */ /*lock_rel_val=*/0, /*lock_rel_id=*/0, /*lock_acq_enable=*/0, - /*lock_acq_val=*/0, /*lock_ackq_id=*/0); + /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/padBefore[0], + /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2], + /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1], + /*d2_zero_after=*/padAfter[2]); return setAddressForSingleBD(builder, bd_op, tile); } @@ -392,13 +443,15 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase { return failure(); } + auto channelDir = op.getDirection(); + // Lower all BDs for (auto it = body.begin(); it != body.end(); ++it) { Block &block = *it; if (shouldSkipBlock(block)) { continue; } - if (failed(rewriteSingleBD(builder, block, tile))) { + if (failed(rewriteSingleBD(builder, block, tile, channelDir))) { return failure(); } } diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index bd94422679..b18fd12ebe 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -317,6 +317,7 @@ struct DmaToNpuPattern : 
OpConversionPattern { auto d0_stride = zero; auto d1_size = zero; auto d1_stride = zero; + auto d2_size = zero; auto d2_stride = zero; auto iteration_current = zero; auto iteration_size = zero; @@ -330,6 +331,12 @@ struct DmaToNpuPattern : OpConversionPattern { auto lock_acq_enable = zero; auto lock_acq_val = zero; auto lock_acq_id = zero; + auto d0_zero_before = zero; + auto d1_zero_before = zero; + auto d2_zero_before = zero; + auto d0_zero_after = zero; + auto d1_zero_after = zero; + auto d2_zero_after = zero; auto issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; @@ -349,6 +356,9 @@ struct DmaToNpuPattern : OpConversionPattern { // column column = IntegerAttr::get(i32ty, col); + // row + row = IntegerAttr::get(i32ty, 0); + // arg_idx AIEX::RuntimeSequenceOp seq_op = op->getParentOfType(); @@ -405,6 +415,12 @@ struct DmaToNpuPattern : OpConversionPattern { // d2_stride d2_stride = IntegerAttr::get(i32ty, strides[2]); + + // d2_size + if (targetModel.isMemTile(col, 0)) // Need to be any row + d2_size = IntegerAttr::get(i32ty, sizes[2]); + else + d2_size = IntegerAttr::get(i32ty, 0); } // iteration_current, iteration_size, iteration_stride, repeat_count if (inputSizes[3] > 1) { @@ -439,6 +455,24 @@ struct DmaToNpuPattern : OpConversionPattern { // lock_acq_id + // d0_zero_before + d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + + // d1_zero_before + d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + + // d2_zero_before + d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + + // d0_zero_after + d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + + // d1_zero_after + d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + + // d2_zero_after + d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + // Set the issue_token issue_token = BoolAttr::get(ctx, op.getIssueToken()); // Earlier, all S2MM channels were implicitly assumed to issue a token. 
@@ -446,12 +480,20 @@ struct DmaToNpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); + if (targetModel.isMemTile(col, 0) && (!isMM2S) && + (op.getD0ZeroBefore() != 0 || op.getD0ZeroAfter() != 0 || + op.getD1ZeroBefore() != 0 || op.getD1ZeroAfter() != 0 || + op.getD2ZeroBefore() != 0 || op.getD2ZeroAfter() != 0)) + op->emitOpError("MemTile supports zero padding only on MM2S direction"); + rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, - d0_stride, d1_size, d1_stride, d2_stride, iteration_current, + d0_stride, d1_size, d1_stride, d2_size, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, - lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); + lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d1_zero_after, d2_zero_after); uint64_t addr = getBufferDescriptorAddressRegisterAddress( targetModel, op.getId(), col, 0); @@ -573,6 +615,12 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[7] |= (op.getLockAcqEnable() & 0x1) << 12; words[7] |= (op.getLockAcqVal() & 0xef) << 5; words[7] |= op.getLockAcqId() & 0xf; + + if (op.getD0ZeroBefore() || op.getD1ZeroBefore() || + op.getD2ZeroBefore() || op.getD0ZeroAfter() || op.getD1ZeroAfter() || + op.getD2ZeroAfter()) { + op->emitError("Zero padding is only available on MemTile"); + } } else if (tm.isMemTile(op.getColumn(), op.getRow())) { bd_addr = (op.getColumn() << tm.getColumnShift()) | (op.getRow() << tm.getRowShift()) | (0xA0000 + bd_id * 0x20); @@ -584,6 +632,7 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { words[0] |= op.getBufferLength() & 0x1ffff; // DMA_BDX_1 + words[1] |= (op.getD0ZeroBefore() & 0x3F) << 26; words[1] |= (op.getNextBd() & 0x3f) << 20; words[1] |= (op.getUseNextBd() & 0x1) << 19; words[1] |= 
op.getBufferOffset() & 0x7ffff; @@ -594,15 +643,20 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { // DMA_BDX_3 // TODO: Secure Access + words[3] |= (op.getD1ZeroBefore() & 0x1F) << 27; words[3] |= (op.getD1Size() & 0x3ff) << 17; words[3] |= op.getD1Stride() & 0x1ffff; // DMA_BDX_4 // TODO: D2Size + words[4] |= (op.getD2ZeroBefore() & 0xF) << 27; words[4] |= op.getD2Stride() & 0x1ffff; // DMA_BDX_5 // ToDO: D3Stride + words[5] |= (op.getD2ZeroAfter() & 0xF) << 28; + words[5] |= (op.getD1ZeroAfter() & 0x1F) << 23; + words[5] |= (op.getD0ZeroAfter() & 0x3F) << 17; // DMA_BDX_6 words[6] |= (op.getIterationCurrent() & 0x3f) << 23; diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp index 9c11596119..3e227310cf 100644 --- a/programming_examples/basic/passthrough_dmas/test.cpp +++ b/programming_examples/basic/passthrough_dmas/test.cpp @@ -192,4 +192,4 @@ int main(int argc, const char *argv[]) { std::cout << std::endl << "fail." 
<< std::endl << std::endl; return 1; } -} +} \ No newline at end of file diff --git a/python/dialects/aie.py b/python/dialects/aie.py index 712d763ea8..2821526d90 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -109,6 +109,12 @@ def bd_dim_layout(size, stride): return Attribute.parse(f"#aie.bd_dim_layout<{size=}, {stride=}>") +def bd_pad_layout(const_pad_before, const_pad_after): + return Attribute.parse( + f"#aie.bd_pad_layout<{const_pad_before=}, {const_pad_after=}>" + ) + + @register_attribute_builder("BDDimLayoutArrayAttr") def bd_dim_layout_array_attr_builder(tups: List[Attribute | Tuple[int]], context=None): if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups): @@ -127,6 +133,17 @@ def bd_dim_layout_array_array_attr_builder(tup_arrs: List[List[tuple]], context= ) +@register_attribute_builder("BDPadLayoutArrayAttr") +def bd_pad_layout_array_attr_builder( + tups: List[Union[Attribute, Tuple[int]]], context=None +): + if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups): + tups = list(map(lambda t: bd_pad_layout(*t), tups)) + return Attribute.parse( + f'#aie', context=context + ) + + @register_attribute_builder("AIEI1Attr") def _i1Attr(x, context): return IntegerAttr.get(IntegerType.get_signless(1, context=context), x) @@ -381,6 +398,7 @@ def __init__( initValues=None, via_DMA=None, plio=None, + padDimensions=None, disable_synchronization=None, ): self.datatype = try_convert_np_type_to_mlir_type(datatype) @@ -409,6 +427,7 @@ def __init__( dimensionsFromStreamPerConsumer=dimensionsFromStreamPerConsumer, via_DMA=via_DMA, plio=plio, + padDimensions=padDimensions, disable_synchronization=disable_synchronization, initValues=initValues, ) diff --git a/python/utils/trace.py b/python/utils/trace.py index 668455881e..8f1b4e6624 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -527,9 +527,16 @@ def configure_shimtile_tracing_aie2( column=int(shim.col), d0_size=0, d0_stride=0, + d0_zero_after=0, + 
d0_zero_before=0, d1_size=0, d1_stride=0, + d1_zero_after=0, + d1_zero_before=0, + d2_size=0, d2_stride=0, + d2_zero_after=0, + d2_zero_before=0, iteration_current=0, iteration_size=0, iteration_stride=0, diff --git a/test/Targets/NPU/npu_blockwrite_instgen.mlir b/test/Targets/NPU/npu_blockwrite_instgen.mlir index 4ba0b41342..9ca60fc63d 100644 --- a/test/Targets/NPU/npu_blockwrite_instgen.mlir +++ b/test/Targets/NPU/npu_blockwrite_instgen.mlir @@ -46,9 +46,16 @@ module { row = 1 : i32, d0_stride = 5 : i32, d0_size = 6 : i32, + d0_zero_after = 0 : i32, + d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 8 : i32, + d1_zero_after = 0 : i32, + d1_zero_before = 0 : i32, + d2_size = 1 : i32, d2_stride = 9 : i32, + d2_zero_after = 0 : i32, + d2_zero_before = 0 : i32, ddr_id = 10 : i32, iteration_current = 11 : i32, iteration_stride = 12 : i32, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir new file mode 100644 index 0000000000..8ff16ccaf1 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir @@ -0,0 +1,25 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_1, S2MM, 0) { + // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir new file mode 100644 index 0000000000..e530ec4762 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + // expected-error@+1 {{Padding is only supported by memtile dma bds.}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir new file mode 100644 index 0000000000..466c73b929 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir new file mode 100644 index 0000000000..45f95e0056 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir @@ -0,0 +1,27 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + // expected-error@+1 {{Mismatch number of dimensions between padding(s) and wrap(s) and stride(s).}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], [, ]) + {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir new file mode 100644 index 0000000000..3e58b8a5af --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir @@ -0,0 +1,26 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + // expected-error@+1 {{Padding is supported only on MemTiles.}} + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir index 2ad275b804..82ae4df6d1 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir @@ -17,13 +17,13 @@ module { %tile_2_0 = aie.tile(2, 0) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, 
iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = 
aiex.dma_configure_task(%tile_2_0, S2MM, 1) { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} @@ -40,5 +40,4 @@ module { aiex.dma_await_task(%t2) } } -} - +} \ No newline at end of file diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir index b57cbc81bd..094284c77f 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir @@ -16,11 +16,11 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: 
aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} - // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32} - // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : 
i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118852 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 0 : i32} diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir index 191f1511ee..36c828393c 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir @@ -16,7 +16,7 @@ module { %tile_0_2 = aie.tile(0, 2) aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id 
= 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32} %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { aie.dma_bd(%arg0 : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir index 22df05bca5..4c4e6e7a1a 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir @@ -19,7 +19,7 @@ module { %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> aiex.runtime_sequence(%arg0: memref<32xi8>) { - // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, 
packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { aie.dma_bd(%buf : memref<32xi8>, 4, 16, diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir index 5a6519a4ee..f1fe68dda0 100644 --- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir @@ -16,14 +16,14 @@ module { aie.shim_dma_allocation @alloc1 (S2MM, 1, 2) aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) { - // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : 
i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} %t1 = aiex.dma_configure_task_for @alloc0 { aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32} aie.end } {issue_token = true} - // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} + // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 
: i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} %t2 = aiex.dma_configure_task_for @alloc1 { aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32} aie.end diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir new file mode 100644 index 0000000000..aa72ecfe7f --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 2 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], []) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir new file mode 100644 index 0000000000..e2f603fff2 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 1 : i32, d2_zero_before = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], [, , ]) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-9.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-9.mlir new file mode 100644 index 0000000000..dbbf1b99d0 --- /dev/null +++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-9.mlir @@ -0,0 +1,28 @@ +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s + +module { + aie.device(npu1_4col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> + + aiex.runtime_sequence(%arg0: memref<32xi8>) { + // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_size = 4 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32} + %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) { + aie.dma_bd(%buf : memref<32xi8>, 4, 16, + [, , ], [, ]) {bd_id = 0 : i32} + aie.end + } + } + } +} + diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index 383f6ac567..42bd6bdf26 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -15,7 +15,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 
0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -26,7 +26,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, 
buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -37,7 +37,7 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, 
enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} } } } @@ -48,7 +48,51 @@ module { aie.device(npu1_4col) { aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + 
+module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile only supports 3 dimensions of sizes.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 100 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile doesn't support zero padding.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 2 : i32, d2_zero_before = 1 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : 
i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile doesn't support zero padding.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + } + } +} + +// ----- + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + // expected-error@+1 {{ShimTile doesn't support zero padding.}} + aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 512 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_size = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : 
i32, valid_bd = 1 : i32} } } } \ No newline at end of file diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir index a88877e659..a9a28b94f2 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -577,7 +577,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val 
= 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -587,7 +587,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, 
lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -597,7 +597,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 6735 : ui32} // events: 0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : 
i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -607,7 +607,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 6735 : ui32} // events:0x00 00 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 
: i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -617,7 +617,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 
: i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -627,7 +627,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, 
d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir index 09f1eac02f..c21ee5b652 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -425,7 +425,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : 
i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -435,7 +435,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, 
iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -445,7 +445,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet 
= 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -455,7 +455,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride 
= 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -465,7 +465,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, 
buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 11 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -475,7 +475,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : 
i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir index be13d7523c..0c8e533e80 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir +++ 
b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -171,7 +171,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} 
aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -181,7 +181,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id 
= 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir index 1a78398e26..17a4ec3c8c 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -451,7 +451,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 3 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 3 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 3 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 15 : i32, buffer_length = 8192 : i32, buffer_offset = 25600 : i32, column = 3 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, 
d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 3: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 3 : i32, row = 0 : i32, value = 15 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 2 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -461,7 +461,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 2 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 2 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 2 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 14 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, 
d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 2: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 14 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 1 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -471,7 +471,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 1 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 1 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 1 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 13 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, 
d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 1: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 13 : ui32} aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) @@ -481,7 +481,7 @@ module { aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} // events: 0x2D(lock release) 2C(lock acquire) 1A(lock stall) 4F(port1 run) aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} // [13:8] port1 MM2S-0+1, [5:0] port0 S2MM-0+1 aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 12 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 
0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0: i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 12 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 2 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -491,7 +491,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 2 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 2 : i32, row = 1 : i32, value = 589439264 : ui32} // [29:24] port3 S2MM-3, [21:16] port2 S2MM-2, [13:8] port1 S2MM-1, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 2 : i32, row = 1 : i32, value = 0 : ui32} // [5:0] port4 MM2S-0 - aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 6: i32, packet_type = 3 : i32, use_next_bd = 0 : 
i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 11 : i32, buffer_length = 8192 : i32, buffer_offset = 17408 : i32, column = 2 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 6: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 2 : i32, row = 0 : i32, value = 11 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 1 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -501,7 +501,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 1 : i32, row = 1 : i32, value = 1415076960 : ui32} // events: 0x54(port1 run) 58(port2 run) 5C(port3 run) 60(port4 run) aiex.npu.write32 {address = 724736 : ui32, column = 1 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 1 : i32, row = 1 : i32, value = 3 : ui32} // [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, 
lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 10 : i32, buffer_length = 8192 : i32, buffer_offset = 9216 : i32, column = 1 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 5: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 1 : i32, row = 0 : i32, value = 10 : ui32} aiex.npu.write32 {address = 606208 : ui32, column = 0 : i32, row = 1 : i32, value = 40192 : ui32} // [15:8] reset event: 157(BROADCAST_15) @@ -511,7 +511,7 @@ module { aiex.npu.write32 {address = 606436 : ui32, column = 0 : i32, row = 1 : i32, value = 1549821032 : ui32} // events: 5C(port3 run) 60(port4 run) 64(port5 run) 68(port6 run) aiex.npu.write32 {address = 724736 : ui32, column = 0 : i32, row = 1 : i32, value = 33620000 : ui32} // [29:24] port3 MM2S-2, [21:16] port2 MM2S-1, [13:8] port1 MM2S-0, [5:0] port0 S2MM-0 aiex.npu.write32 {address = 724740: ui32, column = 0 : i32, row = 1 : i32, value = 270595 : ui32} // [21:16] port6 MM2S-4, [13:8] port5 S2MM-1, [5:0] port4 MM2S-3 - aiex.npu.writebd {bd_id = 9 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, 
iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 9 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, row = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 2 : i32, enable_packet = 1 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 4: i32, packet_type = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 9 : ui32} aiex.npu.write32 {address = 212992: ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} // [14:8] reset event: 127(USER_EVENT_1) diff --git a/test/npu-xrt/memtile_dmas/writebd/aie.mlir b/test/npu-xrt/memtile_dmas/writebd/aie.mlir index eb414000be..047cbe33e6 100644 --- a/test/npu-xrt/memtile_dmas/writebd/aie.mlir +++ b/test/npu-xrt/memtile_dmas/writebd/aie.mlir @@ -20,14 +20,14 @@ module { aie.flow(%tile_0_1, DMA : 0, %tile_0_0, DMA : 0) aie.shim_dma_allocation @in(MM2S, 0, 0) aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 
: i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 2 : i32, arg_plus = 0 : i32} aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483648 : ui32} - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 64 : i32, lock_acq_val = 127 : i32, lock_rel_id = 65 : i32, lock_rel_val = 1 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset 
= 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 64 : i32, lock_acq_val = 127 : i32, lock_rel_id = 65 : i32, lock_rel_val = 1 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 656900 : ui32, column = 0 : i32, row = 1 : i32, value = 0 : ui32} - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 65 : i32, lock_acq_val = 127 : i32, lock_rel_id = 64 : i32, lock_rel_val = 1 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 65 : i32, lock_acq_val = 127 : i32, lock_rel_id = 64 : i32, lock_rel_val = 1 : i32, next_bd = 1 : i32, 
out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 656948 : ui32, column = 0 : i32, row = 1 : i32, value = 1 : ui32} - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} aiex.npu.maskwrite32 {address = 119296 : ui32, column = 0 : i32, row = 0 : i32, mask = 0x00000F00 : ui32, value = 0x100 : ui32} aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32} diff --git a/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir b/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir index 
10e2b0b707..78c9656d88 100644 --- a/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir +++ b/test/npu-xrt/memtile_dmas/writebd_tokens/aie.mlir @@ -26,18 +26,18 @@ module { aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { // BD0, DMA_S2MM_0_Task_Queue - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 2 : i32, arg_plus = 0 : i32} aiex.npu.maskwrite32 {address = 0x1d200 : ui32, column = 0 : i32, row = 0 : i32, mask = 0x00000F00 : ui32, value = 0x200 : ui32} aiex.npu.write32 {address = 0x1d204 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32} // BD1, DMA_MM2S_0_Task_Queue - 
aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.address_patch {addr = 0x1d024 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32} // BD0, DMA_S2MM_0_Start_Queue - aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd 
= 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.maskwrite32 {address = 0xa0600 : ui32, column = 0 : i32, row = 1 : i32, mask = 0x00000F00 : ui32, value = 0x100 : ui32} aiex.npu.write32 {address = 0xa0604 : ui32, column = 0 : i32, row = 1 : i32, value = 0x80000000 : ui32} @@ -45,7 +45,7 @@ module { aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 1 : i32, row_num = 1 : i32} // BD1, DMA_MM2S_0_Start_Queue - aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 4096 : i32, buffer_offset = 0 : i32, 
column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} aiex.npu.write32 {address = 0xa0634 : ui32, column = 0 : i32, row = 1 : i32, value = 1 : ui32} // sync with the copy out via shimdma diff --git a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py index bb0bdb4203..e42fcb87e7 100644 --- a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py +++ b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py @@ -89,9 +89,16 @@ def sequence(input, output): iteration_stride=0, d0_size=0, d0_stride=0, + d0_zero_after=0, + d0_zero_before=0, d1_size=0, d1_stride=0, + d1_zero_after=0, + d1_zero_before=0, + d2_size=0, d2_stride=0, + d2_zero_after=0, + d2_zero_before=0, enable_packet=0, out_of_order_id=0, packet_id=0, diff --git a/test/objectFifo-stateful-transform/memtile_padding_test.mlir b/test/objectFifo-stateful-transform/memtile_padding_test.mlir new file mode 100644 index 0000000000..0fe3b75af7 --- /dev/null +++ b/test/objectFifo-stateful-transform/memtile_padding_test.mlir @@ -0,0 +1,182 @@ +//===- memtile_padding_test.mlir --------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s + // CHECK: %tile_0_0 = aie.tile(0, 0) + // CHECK: %tile_0_1 = aie.tile(0, 1) + // CHECK: %tile_0_2 = aie.tile(0, 2) + // CHECK: %objFifo_out0_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "objFifo_out0_cons_prod_lock"} + // CHECK: %objFifo_out0_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_cons_lock"} + // CHECK: %objFifo_out1_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_out1_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_out1_cons_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out1_cons_prod_lock"} + // CHECK: %objFifo_out1_cons_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_cons_lock"} + // CHECK: %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"} + // CHECK: %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"} + // CHECK: %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + // CHECK: %objFifo_in1_cons_cons_lock = 
aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + // CHECK: %objFifo_in1_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_0"} : memref<64x64xi8> + // CHECK: %objFifo_in1_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_1"} : memref<64x64xi8> + // CHECK: %objFifo_in1_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in1_prod_lock"} + // CHECK: %objFifo_in1_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_lock"} + // CHECK: %objFifo_in0_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "objFifo_in0_prod_lock"} + // CHECK: %objFifo_in0_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_lock"} + // CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + // CHECK: aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + // CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + // CHECK: aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + // CHECK: %core_0_2 = aie.core(%tile_0_2) { + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: %c0 = arith.constant 0 : index + // CHECK: %c1 = arith.constant 1 : index + // CHECK: %c64 = arith.constant 64 : index + // CHECK: %c12_i8 = arith.constant 12 : i8 + // CHECK: scf.for %arg0 = %c0 to %c64 step %c1 { + // CHECK: scf.for %arg1 = %c0 to %c64 step %c1 { + // CHECK: %0 = memref.load %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8> + // CHECK: %1 = arith.addi %0, %c12_i8 : i8 + // CHECK: memref.store %1, %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8> + // CHECK: } + // CHECK: } + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + // CHECK: aie.end + // CHECK: } + // CHECK: aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + // CHECK: aiex.runtime_sequence(%arg0: memref<61x56xi8>, 
%arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64x64xi8> + // CHECK: aiex.npu.dma_wait {symbol = @objFifo_out0} + // CHECK: } + // CHECK: %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + // CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + // CHECK: ^bb1: + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb2 + // CHECK: ^bb2: + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb1 + // CHECK: ^bb3: + // CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + // CHECK: ^bb4: + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb5 + // CHECK: ^bb5: + // CHECK: aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb4 + // CHECK: ^bb6: + // CHECK: %2 = aie.dma_start(S2MM, 1, ^bb7, ^bb9) + // CHECK: ^bb7: + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb8 + // CHECK: ^bb8: + // CHECK: 
aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb7 + // CHECK: ^bb9: + // CHECK: %3 = aie.dma_start(MM2S, 1, ^bb10, ^bb12) + // CHECK: ^bb10: + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb11 + // CHECK: ^bb11: + // CHECK: aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + // CHECK: aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb10 + // CHECK: ^bb12: + // CHECK: aie.end + // CHECK: } + // CHECK: aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + // CHECK: %mem_0_2 = aie.mem(%tile_0_2) { + // CHECK: %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) + // CHECK: ^bb1: + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb2 + // CHECK: ^bb2: + // CHECK: aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + // CHECK: aie.next_bd ^bb1 + // CHECK: ^bb3: + // CHECK: %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6) + // CHECK: ^bb4: + // CHECK: aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_buff_0 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb5 + // CHECK: ^bb5: + // CHECK: 
aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + // CHECK: aie.dma_bd(%objFifo_out1_buff_1 : memref<64x64xi8>, 0, 4096) + // CHECK: aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + // CHECK: aie.next_bd ^bb4 + // CHECK: ^bb6: + // CHECK: aie.end + // CHECK: } + +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + aie.objectfifo @objFifo_in0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ([] []) + aie.objectfifo @objFifo_out1(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%tile_0_1 dimensionsToStream [, ], {%tile_0_0}, 2 : i32) {padDimensions = #aie, ]>} : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ([] []) + %core_0_2 = aie.core(%tile_0_2) { + %subview = aie.objectfifo.acquire @objFifo_in1 (Consume, 1) : !aie.objectfifosubview> + %subview1 = aie.objectfifo.acquire @objFifo_out1 (Produce, 1) : !aie.objectfifosubview> + %elem = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<64x64xi8> + %elem1 = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview> -> memref<64x64xi8> + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c12_i8 = arith.constant 12 : i8 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %elem[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %elem1[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.objectfifo.release @objFifo_in1 (Consume, 1) + aie.objectfifo.release @objFifo_out1 (Produce, 1) + aie.end + } + + aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) 
{id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + } +} \ No newline at end of file diff --git a/test/objectFifo-stateful-transform/nd_dma_base_AIE2.mlir b/test/objectFifo-stateful-transform/nd_dma_base_AIE2.mlir index b44f634192..7e05ad1466 100644 --- a/test/objectFifo-stateful-transform/nd_dma_base_AIE2.mlir +++ b/test/objectFifo-stateful-transform/nd_dma_base_AIE2.mlir @@ -133,7 +133,7 @@ module @ndDMAObjFifoAIE2 { // this case between two adjacent tiles, we need to use DMAs if a data // layout transformation with dimensionsToStream and dimensionsFromStream was specified. aie.objectfifo @of0 (%tile12 dimensionsToStream [, , ], // transpose - {%tile13 dimensionsFromStream []}, + {%tile13 dimensionsFromStream []}, 4 : i32) : !aie.objectfifo> aie.objectfifo @of1 (%tile12 dimensionsToStream [], {%tile33}, diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index c3a02a201a..f9d57e2108 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -13,7 +13,7 @@ # CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} # CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} # CHECK: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd 
= 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} # CHECK: aiex.npu.address_patch {addr = 118884 : ui32, arg_idx = 2 : i32, arg_plus = 1024 : i32} # CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} diff --git a/test/python/zero_pad.py b/test/python/zero_pad.py new file mode 100644 index 0000000000..8257a18271 --- /dev/null +++ b/test/python/zero_pad.py @@ -0,0 +1,71 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# RUN: %python %s | FileCheck %s
# CHECK: aie.objectfifo @out(%tile_0_1 dimensionsToStream [<size = 5, stride = 5>, <size = 5, stride = 5>], {%tile_0_0}, 1 : i32) {padDimensions = #aie<bd_pad_layout_array[<const_pad_before = 2, const_pad_after = 0>, <const_pad_before = 3, const_pad_after = 0>]>} : !aie.objectfifo<memref<56xi32>>

# Test script: builds a two-tile design (shim tile + mem tile) in which an
# input object fifo is linked to an output object fifo that carries a
# `padDimensions` attribute, then prints the resulting MLIR module so the
# FileCheck directive above can verify how the attribute is printed.
#
# Optional command-line arguments (all positional):
#   argv[1]  N       length of the runtime-sequence memrefs (default 56)
#   argv[2]  device  "npu" or "xcvc1902"                    (default npu1_1col)
#   argv[3]  col     column index used for both tiles       (default 0)
import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx

# NOTE(review): `range_` is not referenced anywhere in this script.
from aie.extras.dialects.ext.scf import _for as range_

# Defaults, overridden below when the matching argument is supplied.
N = 56
dev = AIEDevice.npu1_1col
col = 0

if len(sys.argv) > 1:
    N = int(sys.argv[1])

if len(sys.argv) > 2:
    if sys.argv[2] == "npu":
        dev = AIEDevice.npu1_1col
    elif sys.argv[2] == "xcvc1902":
        dev = AIEDevice.xcvc1902
    else:
        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2]))

if len(sys.argv) > 3:
    col = int(sys.argv[3])


def my_passthrough():
    """Build the padded pass-through design and print its MLIR module.

    Uses the module-level settings ``dev``, ``col`` and ``N``. Takes no
    arguments and returns nothing; the only observable effect is the module
    text written to standard output.
    """
    with mlir_mod_ctx() as ctx:

        @device(dev)
        def device_body():
            # Element types of the two fifos: 25 x i32 in, 56 x i32 out.
            memRef_ty = T.memref(25, T.i32())
            memRef_ty2 = T.memref(56, T.i32())

            # Tile declarations
            ShimTile = tile(col, 0)
            MemTile = tile(col, 1)

            # AIE-array data movement with object fifos
            of_in = object_fifo("in", ShimTile, MemTile, 1, memRef_ty)
            # NOTE(review): presumably each padDimensions tuple is
            # (pad_before, pad_after) for the corresponding streamed
            # dimension, so (5 + 2) * (5 + 3) = 56 elements leave the mem
            # tile, matching memRef_ty2 -- confirm against the dialect docs.
            of_out = object_fifo(
                "out",
                MemTile,
                ShimTile,
                1,
                memRef_ty2,
                dimensionsToStream=[(5, 5), (5, 5)],
                padDimensions=[(2, 0), (3, 0)],
            )
            object_fifo_link(of_in, of_out)

            # To/from AIE-array data movement
            tensor_ty = T.memref(N, T.i32())

            @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
            def sequence(A, B, C):
                # B is accepted by the sequence signature but not used here.
                npu_dma_memcpy_nd(
                    metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True
                )
                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
                dma_wait(of_in, of_out)

        print(ctx.module)


my_passthrough()