diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index 429089cdd..0f28d3125 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 
region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,iree-amdaie-assign-tiles-to-objectfifo,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTilesToObjectFifo.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTilesToObjectFifo.cpp new file mode 100644 index 000000000..3904bc29a --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTilesToObjectFifo.cpp @@ -0,0 +1,345 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-assign-tiles-to-objectfifo" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +/// Utility to recursively find users of the provided logical objectFifo inside +/// `amdaie.core` operations and return the tile coordinates. 
+LogicalResult findUsersInCoreAndAddTiles(
+    Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo,
+    llvm::SmallSetVector, 16> &tiles) {
+  // NOTE(review): template argument lists in this copy of the patch (e.g. on
+  // `dyn_cast`, `getParentOfType`, `std::optional`) appear to have been
+  // stripped; those tokens are left exactly as found.
+  //
+  // `tiles` is an in/out accumulator: (column, row) pairs are only ever
+  // inserted, never removed. Returns failure (after emitting an error on the
+  // core op) if a core's tile has a non-constant column or row.
+  for (Operation *userOp : op->getUsers()) {
+    // A user nested inside a core contributes that core's tile coordinates.
+    if (auto coreOp = userOp->getParentOfType()) {
+      AMDAIE::TileOp tileOp = coreOp.getTileOp();
+      std::optional column = getConstantIntValue(tileOp.getCol());
+      std::optional row = getConstantIntValue(tileOp.getRow());
+      if (!column || !row) {
+        return coreOp.emitOpError() << "has non-constant tile location";
+      }
+      tiles.insert(std::make_pair(column.value(), row.value()));
+    }
+    // Recurse into the users of `userOp` when it matches one of the casts
+    // below. The recursive result must not be returned directly: doing so
+    // would end the scan at the first such user and silently skip every
+    // remaining user of `op`. Only a failure is propagated.
+    if (auto subviewOp = dyn_cast(userOp)) {
+      if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo,
+                                            tiles))) {
+        return failure();
+      }
+    } else if (auto userLogicalObjectFifo =
+                   dyn_cast(userOp)) {
+      if (failed(findUsersInCoreAndAddTiles(userLogicalObjectFifo,
+                                            logicalObjectFifo, tiles))) {
+        return failure();
+      }
+    }
+  }
+  return success();
+}
+
+/// Assign tiles to the logical objectfifos with local memory space (L1).
+/// The tiles are derived from the usage of the logical objectfifos within
+/// core operations, which are already assigned a tile location.
+///
+/// Every matching logical objectfifo is replaced by a new op carrying the
+/// collected tiles. Returns failure if collecting the tile coordinates fails
+/// for any logical objectfifo.
+LogicalResult assignLocalAieTiles(ModuleOp moduleOp) {
+  IRRewriter rewriter(moduleOp.getContext());
+
+  WalkResult res = moduleOp->walk(
+      [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) {
+        // Only logical objectfifos whose memory space attribute is 2 are
+        // handled here; everything else is left for later steps.
+        Attribute memSpace = logicalObjectFifo.getMemorySpace();
+        if (!memSpace || dyn_cast(memSpace).getInt() != 2)
+          return WalkResult::advance();
+
+        llvm::SmallSetVector, 16> tileLocations;
+        if (failed(findUsersInCoreAndAddTiles(
+                logicalObjectFifo, logicalObjectFifo, tileLocations))) {
+          return WalkResult::interrupt();
+        }
+        // Handle subviews. The memref may have no defining op (e.g. when it
+        // is a block argument); there is then nothing further to scan, so
+        // guard against dereferencing a null operation.
+        if (Operation *memrefDefOp =
+                logicalObjectFifo.getMemref().getDefiningOp()) {
+          for (Operation *userOp : memrefDefOp->getUsers()) {
+            if (auto subviewOp = dyn_cast(userOp)) {
+              if (failed(findUsersInCoreAndAddTiles(
+                      subviewOp, logicalObjectFifo, tileLocations))) {
+                return WalkResult::interrupt();
+              }
+            }
+          }
+        }
+
+        // Materialize one tile op (with constant column/row operands) per
+        // collected location, right before the logical objectfifo.
+        SmallVector tiles;
+        tiles.reserve(tileLocations.size());
+        rewriter.setInsertionPoint(logicalObjectFifo);
+        for (auto [column, row] : tileLocations) {
+          auto colIndex = rewriter.create(
+              rewriter.getUnknownLoc(), column);
+          auto rowIndex = rewriter.create(
+              rewriter.getUnknownLoc(), row);
+          auto tileOp = rewriter.create(
+              rewriter.getUnknownLoc(), colIndex, rowIndex);
+          tiles.push_back(tileOp.getResult());
+        }
+        // Sort for deterministic output IR.
+        llvm::sort(tiles.begin(), tiles.end(),
+                   AMDAIE::TileOp::tileValueColumnAndRowComparator);
+        rewriter.replaceOpWithNewOp(
+            logicalObjectFifo,
+            cast(
+                logicalObjectFifo.getOutput().getType()),
+            logicalObjectFifo.getMemref(), tiles);
+        return WalkResult::advance();
+      });
+  if (res.wasInterrupted()) return failure();
+  return success();
+}
+
+/// Return the tiles of the sources respectively targets of the users of this
+/// logical objectfifo, depending on whether the OperateOn template parameter is
+/// set to `OperateOn::Source` respectively `OperateOn::Target`.
+// NOTE(review): template argument lists in this copy of the patch appear to
+// have been stripped (e.g. after `template`, `SmallVectorImpl`, `dyn_cast`);
+// those tokens are left exactly as found.
+//
+// On success, `tiles` is overwritten with the de-duplicated tiles collected
+// from the matching users. Returns failure as soon as one matching user's
+// counterpart objectfifo has no tiles; `tiles` is not modified in that case.
+template
+LogicalResult getUserTiles(
+    AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo,
+    SmallVectorImpl &tiles) {
+  llvm::SmallSetVector tileSet;
+  for (Operation *user : logicalObjectFifo->getUsers()) {
+    if (auto dmaOp = dyn_cast(user)) {
+      ValueRange tileIndices;
+      if constexpr (OperateOn == CopyOpOperateOn::Source) {
+        // Source mode: only users that write into this objectfifo count;
+        // take the tiles of the objectfifo they read from.
+        if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue;
+        tileIndices = dmaOp.getSourceObjectFifo().getTiles();
+      } else if constexpr (OperateOn == CopyOpOperateOn::Target) {
+        // Target mode: only users that read from this objectfifo count;
+        // take the tiles of the objectfifo they write into.
+        if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue;
+        tileIndices = dmaOp.getTargetObjectFifo().getTiles();
+      }
+
+      // Only fill in tiles when all sources have tiles.
+      if (tileIndices.empty()) return failure();
+      for (Value index : tileIndices) {
+        // NOTE(review): the cast result is inserted without a null check, so
+        // a tile value with an unexpected (or no) defining op would put a
+        // null entry into the set — confirm this cannot happen.
+        tileSet.insert(
+            dyn_cast_if_present(index.getDefiningOp()));
+      }
+    }
+  }
+  tiles = tileSet.takeVector();
+  return success();
+}
+
+/// Assign a set of potential physical AIE tiles to logical objectFifos. This
+/// rewrite takes an iterative approach by matching logical objectfifos and only
+/// assigning tiles when linked through dma ops with other logical objectfifos
+/// which already have tiles assigned. If the linked logical objectfifos don't
+/// have tiles assigned yet, we will return a failure and give the linked
+/// logical objectfifos a chance to assign tiles before returning to this one.
+///
+/// TODO(jornt): There are decisions being made in this pass on which tiles to
+/// assign to a logical objectfifo. This logic is very simple for now and tries
+/// to use the tiles in the same columns as targets and sources. At some point,
+/// we probably need some AIE device model to guide the assignment here for
+/// performance and to avoid hardware resource issues later on.
+// NOTE(review): template argument lists in this copy of the patch appear to
+// have been stripped (e.g. on `OpRewritePattern`, `dyn_cast`, `getUserTiles`,
+// `SmallVector`); those tokens are left exactly as found.
+class FillAieTiles
+    : public OpRewritePattern {
+  using OpRewritePattern<
+      AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern;
+
+  /// Replaces `logicalObjectFifo` by a copy carrying candidate tile locations
+  /// derived from the tiles of the objectfifos it is linked to. Returns
+  /// failure (no rewrite) when tiles are already present, when the linked
+  /// objectfifos have no tiles yet, or when an unsupported case is hit.
+  LogicalResult matchAndRewrite(
+      AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo,
+      PatternRewriter &rewriter) const override {
+    LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n");
+    // Already has tiles: nothing to do.
+    if (!logicalObjectFifo.getTiles().empty()) {
+      return failure();
+    }
+
+    Attribute memSpace = logicalObjectFifo.getMemorySpace();
+    // Skip logical objectfifos within local memory as they should already be
+    // assigned.
+    if (memSpace && dyn_cast(memSpace).getInt() == 2) {
+      // NOTE(review): this condition is always true here, because the
+      // non-empty case already returned above.
+      if (logicalObjectFifo.getTiles().empty()) {
+        logicalObjectFifo.emitOpError()
+            << "found logical objectfifo on local memory space with no tiles "
+               "assigned.";
+      }
+      return failure();
+    }
+    // Handle both L3/shim and L2/Memtiles.
+    // Skip logical objectfifos within non-global and non-shared memory.
+    if (memSpace && dyn_cast(memSpace).getInt() != 1) {
+      return logicalObjectFifo.emitOpError()
+             << "found logical objectfifo with unknown memory space";
+    }
+
+    // Collect the tiles on the other side of the linked users, once in each
+    // direction.
+    SmallVector targetTiles;
+    SmallVector sourceTiles;
+    LogicalResult dstRes =
+        getUserTiles(logicalObjectFifo, targetTiles);
+    LogicalResult srcRes =
+        getUserTiles(logicalObjectFifo, sourceTiles);
+
+    // If no source and target tiles found, skip.
+    if (failed(dstRes) && failed(srcRes)) {
+      return failure();
+    }
+
+    // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie
+    // target model for this.
+    // Row 1 is used when a memory space attribute is present (value 1 at this
+    // point), row 0 when there is none.
+    int64_t rowInt = memSpace ? 1 : 0;
+    llvm::SmallSetVector, 16> tileLocations;
+    // Adds one (column, rowInt) location to `tileLocations` per distinct
+    // entry of `tiles`; sorts and de-duplicates `tiles` in place.
+    auto createTileLocations =
+        [&](SmallVector &tiles) -> LogicalResult {
+      // TODO(jornt): For now, for deterministic behaviour, sort on column
+      // index and use first one. This needs to be generalized to assign
+      // tiles based on a resource model.
+      std::sort(tiles.begin(), tiles.end(),
+                AMDAIE::TileOp::tileColumnComparator);
+      // Erase duplicates.
+      tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end());
+      for (AMDAIE::TileOp tile : tiles) {
+        std::optional column = getConstantIntValue(tile.getCol());
+        if (!column) return tile.emitOpError() << "found non-constant column";
+        tileLocations.insert(std::make_pair(column.value(), rowInt));
+      }
+      return success();
+    };
+
+    if (!targetTiles.empty() && !sourceTiles.empty()) {
+      return logicalObjectFifo.emitOpError()
+             << "found logical objectfifo with both source and target tiles, "
+                "which is not supported yet";
+    } else if (!targetTiles.empty()) {
+      // Create tile locations for this logical objectfifo based on target
+      // tiles.
+      if (failed(createTileLocations(targetTiles))) {
+        return failure();
+      }
+    } else if (!sourceTiles.empty()) {
+      // Create tile locations for this logical objectfifo based on source
+      // tiles.
+      if (failed(createTileLocations(sourceTiles))) {
+        return failure();
+      }
+    } else {
+      // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait
+      // for other logical objectfifos to be assigned first.
+      return failure();
+    }
+
+    // If no tile results, skip, and maybe in a next iteration another tile will
+    // be found.
+    if (tileLocations.empty()) {
+      return failure();
+    }
+
+    rewriter.setInsertionPoint(logicalObjectFifo);
+    rewriter.replaceOpWithNewOp(
+        logicalObjectFifo, logicalObjectFifo.getMemref(),
+        tileLocations.takeVector());
+    return success();
+  }
+};
+
+/// Assign specific tile locations to objectFifos, starting from the set of
+/// potential tile locations filled in earlier.
+LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (memSpace && dyn_cast(memSpace).getInt() != 1) + return WalkResult::advance(); + + SmallVector tiles = + llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { + return dyn_cast_if_present(tile.getDefiningOp()); + }); + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + + // For now, use first tile in sorted list. + // TODO(jornt): This will need to become more complex in the future to + // account for potential hardware limitations and constraints. + SmallVector tileResults = {cast(tiles[0].getResult())}; + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast(logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); + return WalkResult::advance(); + }); + return success(); +} + +class AMDAIEAssignTilesToObjectFifoPass + : public impl::AMDAIEAssignTilesToObjectFifoBase< + AMDAIEAssignTilesToObjectFifoPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + AMDAIEAssignTilesToObjectFifoPass() = default; + AMDAIEAssignTilesToObjectFifoPass( + const AMDAIEAssignTilesToObjectFifoPass &pass){}; + void runOnOperation() override; +}; + +void AMDAIEAssignTilesToObjectFifoPass::runOnOperation() { + MLIRContext *context = &getContext(); + ModuleOp moduleOp = getOperation(); + // IRRewriter rewriter(parentOp); + // parentOp->walk([&](func::FuncOp funcOp) { /* do something */ }); + + // Assign tile locations to logical objectfifos on local (L1) memory. 
+ if (failed(assignLocalAieTiles(moduleOp))) { + moduleOp.emitOpError() << "local tile assignment failed"; + return signalPassFailure(); + } + + if (failed(verify(moduleOp, true))) { + return signalPassFailure(); + } + + LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n" + << moduleOp << "\n"); + + // Assign a set of potential tile locations to the remaining logical + // objectFifos. + RewritePatternSet assignAieTilePatters(context); + assignAieTilePatters.insert(context); + if (failed(applyPatternsAndFoldGreedily(moduleOp, + std::move(assignAieTilePatters)))) { + moduleOp.emitOpError() + << "collection of tile candidates for logical objectFifos failed"; + return signalPassFailure(); + } + + if (failed(verify(moduleOp, true))) { + return signalPassFailure(); + } + LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n" + << moduleOp << "\n"); + + // Assign specific tile locations to objectFifos, starting from the set of + // potential tile locations filled in earlier. + if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) { + moduleOp.emitOpError() + << "tile assignment and logical objectFifo distribution failed"; + return signalPassFailure(); + } + + if (failed(verify(moduleOp, true))) { + return signalPassFailure(); + } + LLVM_DEBUG(llvm::dbgs() + << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n" + << moduleOp << "\n"); +} + +} // namespace + +std::unique_ptr createAMDAIEAssignTilesToObjectFifoPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index dbd439458..083211e32 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -323,37 +323,6 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { } }; -/// Return the tiles of the sources respectively targets of the users of this -/// logical objectfifo, depending on whether the OperateOn template parameter is -/// set to `OperateOn::Source` respectively `OperateOn::Target`. -template -LogicalResult getUserTiles( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - SmallVectorImpl &tiles) { - llvm::SmallSetVector tileSet; - for (Operation *user : logicalObjectFifo->getUsers()) { - if (auto dmaOp = dyn_cast(user)) { - ValueRange tileIndices; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getSourceObjectFifo().getTiles(); - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getTargetObjectFifo().getTiles(); - } - - // Only fill in tiles when all sources have tiles. - if (tileIndices.empty()) return failure(); - for (Value index : tileIndices) { - tileSet.insert( - dyn_cast_if_present(index.getDefiningOp())); - } - } - } - tiles = tileSet.takeVector(); - return success(); -} - /// Insert `amdaie.logicalobjectfifo.access` operations which retrieve the /// memrefs from logical objectfifos and update the computational operations to /// operate on these local memrefs. @@ -454,229 +423,6 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { return success(); } -/// Utility to recursively find users of the provided logical objectFifo inside -/// `amdaie.core` operations and return the tile coordinates. 
-LogicalResult findUsersInCoreAndAddTiles( - Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - llvm::SmallSetVector, 16> &tiles) { - for (Operation *userOp : op->getUsers()) { - if (auto coreOp = userOp->getParentOfType()) { - AMDAIE::TileOp tileOp = coreOp.getTileOp(); - std::optional column = getConstantIntValue(tileOp.getCol()); - std::optional row = getConstantIntValue(tileOp.getRow()); - if (!column || !row) { - return coreOp.emitOpError() << "has non-constant tile location"; - } - tiles.insert(std::make_pair(column.value(), row.value())); - } - if (auto subviewOp = dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); - } else if (auto userLogicalObjectFifo = - dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(userLogicalObjectFifo, - logicalObjectFifo, tiles); - } - } - return success(); -} - -/// Assign tiles to the logical objectfifos with local memory space (L1). -/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. -LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - WalkResult res = moduleOp->walk( - [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (!memSpace || dyn_cast(memSpace).getInt() != 2) - return WalkResult::advance(); - - llvm::SmallSetVector, 16> tileLocations; - if (failed(findUsersInCoreAndAddTiles( - logicalObjectFifo, logicalObjectFifo, tileLocations))) { - return WalkResult::interrupt(); - } - // Handle subviews. 
- for (Operation *userOp : - logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { - if (auto subviewOp = dyn_cast(userOp)) { - if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, - tileLocations))) { - return WalkResult::interrupt(); - } - } - } - - SmallVector tiles; - tiles.reserve(tileLocations.size()); - rewriter.setInsertionPoint(logicalObjectFifo); - for (auto [column, row] : tileLocations) { - auto colIndex = rewriter.create( - rewriter.getUnknownLoc(), column); - auto rowIndex = rewriter.create( - rewriter.getUnknownLoc(), row); - auto tileOp = rewriter.create( - rewriter.getUnknownLoc(), colIndex, rowIndex); - tiles.push_back(tileOp.getResult()); - } - // Sort for deterministic output IR. - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileValueColumnAndRowComparator); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast( - logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles); - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - -/// Assign a set of potential physical AIE tiles to logical objectFifos. This -/// rewrite takes an iterative approach by matching logical objectfifos and only -/// assigning tiles when linked through dma ops with other logical objectfifos -/// which already have tiles assigned. If the linked logical objectfifos don't -/// have tiles assigned yet, we will return a failure and give the linked -/// logical objectfifos a chance to assign tiles before returning to this one. -/// -/// TODO(jornt): There are decisions being made in this pass on which tiles to -/// assign to a logical objectfifo. This logic is very simple for now and tries -/// to use the tiles in the same columns as targets and sources. At some point, -/// we probably need some AIE device model to guide the assignement here for -/// performance and to avoid hardware resource issues later on. 
-class FillAieTiles - : public OpRewritePattern { - using OpRewritePattern< - AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; - - LogicalResult matchAndRewrite( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - PatternRewriter &rewriter) const override { - LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n"); - if (!logicalObjectFifo.getTiles().empty()) { - return failure(); - } - - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - // Skip logical objectfifos within local memory as they should already be - // assigned. - if (memSpace && dyn_cast(memSpace).getInt() == 2) { - if (logicalObjectFifo.getTiles().empty()) { - logicalObjectFifo.emitOpError() - << "found logical objectfifo on local memory space with no tiles " - "assigned."; - } - return failure(); - } - // HandLe both L3/shim and L2/Memtiles. - // Skip logical objectfifos within non-global and non-shared memory. - if (memSpace && dyn_cast(memSpace).getInt() != 1) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with unknown memory space"; - } - - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - - // If no source and target tiles found, skip. - if (failed(dstRes) && failed(srcRes)) { - return failure(); - } - - // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie - // target model for this. - int64_t rowInt = memSpace ? 1 : 0; - llvm::SmallSetVector, 16> tileLocations; - auto createTileLocations = - [&](SmallVector &tiles) -> LogicalResult { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign - // tiles based on a resource model. - std::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - // Erase duplicates. 
- tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); - for (AMDAIE::TileOp tile : tiles) { - std::optional column = getConstantIntValue(tile.getCol()); - if (!column) return tile.emitOpError() << "found non-constant column"; - tileLocations.insert(std::make_pair(column.value(), rowInt)); - } - return success(); - }; - - if (!targetTiles.empty() && !sourceTiles.empty()) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with both source and target tiles, " - "which is not supported yet"; - } else if (!targetTiles.empty()) { - // Create tile locations for this logical objectfifo based on target - // tiles. - if (failed(createTileLocations(targetTiles))) { - return failure(); - } - } else if (!sourceTiles.empty()) { - // Create tile locations for this logical objectfifo based on source - // tiles. - if (failed(createTileLocations(sourceTiles))) { - return failure(); - } - } else { - // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait - // for other logical objectfifos to be assigned first. - return failure(); - } - - // If no tile results, skip, and maybe in a next iteration another tile will - // be found. - if (tileLocations.empty()) { - return failure(); - } - - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, logicalObjectFifo.getMemref(), - tileLocations.takeVector()); - return success(); - } -}; - -/// Assign specific tile locations to objectFifos, starting from the set of -/// potential tile locations filled in earlier. 
-LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (memSpace && dyn_cast(memSpace).getInt() != 1) - return WalkResult::advance(); - - SmallVector tiles = - llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { - return dyn_cast_if_present(tile.getDefiningOp()); - }); - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - - // For now, use first tile in sorted list. - // TODO(jornt): This will need to become more complex in the future to - // account for potential hardware limitations and constraints. - SmallVector tileResults = {cast(tiles[0].getResult())}; - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tileResults); - return WalkResult::advance(); - }); - return success(); -} - class AMDAIEDistributeCoresAndObjectFifosPass : public impl::AMDAIEDistributeCoresAndObjectFifosBase< AMDAIEDistributeCoresAndObjectFifosPass> { @@ -748,51 +494,6 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { } LLVM_DEBUG(llvm::dbgs() << "Module after insertLogicalObjectFifoAccess: \n" << moduleOp << "\n"); - - // Assign tile locations to logical objectfifos on local (L1) memory. - if (failed(assignLocalAieTiles(moduleOp))) { - moduleOp.emitOpError() << "local tile assignment failed"; - return signalPassFailure(); - } - - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - - LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n" - << moduleOp << "\n"); - - // Assign a set of potential tile locations to the remaining logical - // objectFifos. 
- RewritePatternSet assignAieTilePatters(context); - assignAieTilePatters.insert(context); - if (failed(applyPatternsAndFoldGreedily(moduleOp, - std::move(assignAieTilePatters)))) { - moduleOp.emitOpError() - << "collection of tile candidates for logical objectFifos failed"; - return signalPassFailure(); - } - - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n" - << moduleOp << "\n"); - - // Assign specific tile locations to objectFifos, starting from the set of - // potential tile locations filled in earlier. - if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) { - moduleOp.emitOpError() - << "tile assignment and logical objectFifo distribution failed"; - return signalPassFailure(); - } - - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() - << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n" - << moduleOp << "\n"); } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index a180f5f37..0450a6f3a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -105,6 +105,7 @@ iree_cc_library( "AMDAIETemporaryAllocBufferization.cpp" "AMDAIETile.cpp" "AMDAIETileAndFuse.cpp" + "AMDAIEAssignTilesToObjectFifo.cpp" "AMDAIEUtils.cpp" "AMDAIEVectorization.cpp" "BridgeToAIRPass.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 481687e2c..62a6bb3f6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -84,6 +84,7 @@ namespace mlir::iree_compiler::AMDAIE { #define 
GEN_PASS_DEF_AMDAIETEMPORARYALLOCBUFFERIZATION #define GEN_PASS_DEF_AMDAIETILE #define GEN_PASS_DEF_AMDAIETILEANDFUSE +#define GEN_PASS_DEF_AMDAIEASSIGNTILESTOOBJECTFIFO #define GEN_PASS_DEF_AMDAIEVECTORIZATION #include "iree-amd-aie/Transforms/Passes.h.inc" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 20131a39c..6ffd42025 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -598,6 +598,7 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCSEPass()); passManager.addPass(createAMDAIEDistributeCoresAndObjectFifosPass()); + passManager.addPass(createAMDAIEAssignTilesToObjectFifoPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 69868b57f..b2eeb019f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -289,6 +289,9 @@ std::unique_ptr createAMDAIETilePass(AMDAIETileOptions options = {}); std::unique_ptr createAMDAIETileAndFusePass( AMDAIETileAndFuseOptions options = {}); +/// Create pass to TODO(newling) +std::unique_ptr createAMDAIEAssignTilesToObjectFifoPass(); + /// Create pass to propagate pack/unpack ops using upstream patterns. 
std::unique_ptr createAMDAIEPropagateDataLayoutPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 4a6ec2afe..340cb4a9a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -10,6 +10,12 @@ include "iree-amd-aie/IR/AMDAIEDialect.td" include "mlir/Pass/PassBase.td" +def AMDAIEAssignTilesToObjectFifo : + Pass<"iree-amdaie-assign-tiles-to-objectfifo", "ModuleOp"> { + let summary = "TODO"; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignTilesToObjectFifoPass()"; +} + def AMDAIEAccessToAcquireRelease : Pass<"iree-amdaie-access-to-acquire-release", ""> { let summary = "Convert logical objectFifo access operations to acquire/release " @@ -332,7 +338,7 @@ def AMDAIEInsertInfiniteLoopAroundCoreBlock : This pass is meant for developers to allow retrieval of granular performance statistics and is not meant to be enabled by default. 
With this pass enabled, you can put a loop around the hardware queue command submission, for example: - + ``` for (int i = 0; i < N; i++) { ebuf.m_cmd_pkt->state = ERT_CMD_STATE_NEW; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 6cf5c3a12..aab8f9b7d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -80,6 +80,7 @@ iree_lit_test_suite( "tile_and_fuse_matmul_using_scf_forall.mlir" "tile_and_fuse_convolution_using_scf_forall.mlir" "tile_copy_using_scf_for.mlir" + "assign_tiles_to_objectfifo.mlir" "vectorization.mlir" TOOLS ${IREE_LLD_TARGET} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles_to_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles_to_objectfifo.mlir new file mode 100644 index 000000000..cdbb91f24 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles_to_objectfifo.mlir @@ -0,0 +1,38 @@ +// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-assign-tiles-to-objectfifo,cse)" %s | FileCheck %s + +// TODO(newling) this test file is currently very small, as the pass it is +// testing was originally part of distribute-cores-and-objectfifos. Much of +// its functionality is therefore still tested in the file +// distribute_cores_and_objectfifos.mlir. The testing should be moved to here. 
+ +// CHECK-LABEL: @basic_case_0 +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]] +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]] +// CHECK-DAG: logicalobjectfifo.from_memref{{.*}}{%[[TILE_1_1]]} : memref<32x1024xi32, 1> -> +// CHECK-DAG: logicalobjectfifo.from_memref{{.*}}{%[[TILE_1_2]]} : memref<32x64xi32, 2> -> + +// A case where there is a core on tile (col=1, row=2) which has a copy +// from L2 (memory space '1') to L1 (memory space '2') performed by a dma_cpy_nd +// operation. +module { + func.func @basic_case_0() { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + // TODO(newling) making these function arguments results in segfault, shouldn't. + %alloc = memref.alloc() : memref<32x1024xi32, 1> + %alloc_0 = memref.alloc() : memref<32x64xi32, 2> + %tile = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[0, 0] [0, 0] [0, 0]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile, in : [%2], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<32x64xi32, 2> + amdaie.end + } + memref.dealloc %alloc_0 : memref<32x64xi32, 2> + memref.dealloc %alloc : memref<32x1024xi32, 1> + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 3a9e583bc..5830a0b76 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,4 +1,9 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,iree-amdaie-assign-tiles-to-objectfifo,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// TODO(newling) make this just test iree-amdaie-distribute-cores-and-objectfifos +// i.e. remove iree-amdaie-assign-tiles-to-objectfifo from the pass pipeline. +// Why are the 2 passes currently tested together? Because the 2 passes used +// to be a single pass, and the testing has not been updated. // Check for unrolling an amdaie.core within a parallel loop with a single // induction variable with multiple iterations. There are no dma ops in this