From faf94d89cf312040032ca8a77e234c921648f415 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Thu, 21 Nov 2024 04:29:30 -0800 Subject: [PATCH 1/2] Add pass to assign tiles to logical objectFifo --- ..._elementwise_pack_peel_objectfifo_e2e.mlir | 2 +- .../samples/matmul_pack_peel_objectfifo.mlir | 4 +- .../matmul_pack_peel_objectfifo_e2e.mlir | 2 +- ...tmul_pack_peel_objectfifo_ukernel_e2e.mlir | 4 +- .../Transforms/AMDAIEAssignTiles.cpp | 437 ++++++++++++++++++ .../AMDAIEDistributeCoresAndObjectFifos.cpp | 300 +----------- .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 4 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 5 + .../iree-amd-aie/Transforms/Transforms.h | 9 + .../Transforms/test/CMakeLists.txt | 1 + .../Transforms/test/assign_tiles.mlir | 360 +++++++++++++++ .../distribute_cores_and_objectfifos.mlir | 131 ++++-- .../aie_runtime/iree_aie_runtime.cc | 21 + .../aie_runtime/iree_aie_runtime.h | 12 +- 17 files changed, 963 insertions(+), 334 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir index 6130aa54b..e0eacc703 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir @@ -13,7 +13,7 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation 
{{.*}}(MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index 429089cdd..7b0b58026 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 
region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 
max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s @@ -20,7 +20,7 @@ // CHECK: aie.use_lock // Check a bit of the aiex.runtime_sequence: // CHECK: aiex.runtime_sequence @matmul_i32() -// CHECK: } {npu_instructions = dense_resource : tensor<174xui32>, runtime_sequence_name = "matmul_i32"} +// CHECK: } {npu_instructions = dense_resource : tensor<208xui32>, runtime_sequence_name = "matmul_i32"} #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir index 9229da0c3..b69322068 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir index 326a178e5..210b1ce99 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir @@ -15,7 +15,7 @@ // PHOENIX-DAG: aie.core(%[[TILE_0_3]]) // PHOENIX-DAG: aie.core(%[[TILE_1_3]]) // PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // PHOENIX-DAG: aie.memtile_dma(%[[TILE_0_1]]) // PHOENIX-DAG: aie.mem(%[[TILE_0_2]]) // PHOENIX-DAG: aie.mem(%[[TILE_0_3]]) @@ -39,7 +39,7 @@ // STRIX-DAG: aie.core(%[[TILE_0_3]]) // STRIX-DAG: aie.core(%[[TILE_1_3]]) // STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // STRIX-DAG: aie.memtile_dma(%[[TILE_0_1]]) // STRIX-DAG: aie.mem(%[[TILE_0_2]]) // STRIX-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp new file mode 100644 index 000000000..cc54e481b --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp @@ -0,0 +1,437 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelGenerator.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-assign-tiles" + +namespace mlir::iree_compiler::AMDAIE { + +/// Return the tiles of the sources respectively targets of the users of this +/// logical objectfifo, depending on whether the OperateOn template parameter is +/// set to `OperateOn::Source` respectively `OperateOn::Target`. +template +LogicalResult getUserTiles( + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + SmallVectorImpl &tiles) { + llvm::SmallSetVector tileSet; + for (Operation *user : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(user)) { + ValueRange tileIndices; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getSourceObjectFifo().getTiles(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getTargetObjectFifo().getTiles(); + } + // Only fill in tiles when all sources have tiles. + if (tileIndices.empty()) return failure(); + for (Value index : tileIndices) { + tileSet.insert( + dyn_cast_if_present(index.getDefiningOp())); + } + } + } + tiles = tileSet.takeVector(); + return success(); +} + +/// Utility to recursively find users of the provided logical objectFifo inside +/// `amdaie.core` operations and return the tile coordinates. 
+LogicalResult findUsersInCoreAndAddTiles( + Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + llvm::SmallSetVector, 16> &tiles) { + for (Operation *userOp : op->getUsers()) { + if (auto coreOp = userOp->getParentOfType()) { + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + std::optional column = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (!column || !row) + return coreOp.emitOpError() << "has non-constant tile location"; + tiles.insert(std::make_pair(column.value(), row.value())); + } + if (auto subviewOp = dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); + } else if (auto userLogicalObjectFifo = + dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(userLogicalObjectFifo, + logicalObjectFifo, tiles); + } + } + return success(); +} + +/// Utility to clear non-local tile assignments. +LogicalResult clearNonLocalTiles(RewriterBase &rewriter, Operation *op) { + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp objFifo) { + if (objFifo.getMemorySpaceAsUInt() != 2) { + rewriter.setInsertionPoint(objFifo); + SmallVector tiles; + rewriter.replaceOpWithNewOp( + objFifo, cast(objFifo.getOutput().getType()), + objFifo.getMemref(), tiles); + } + }); + return success(); +} + +/// Utility to duplicate global objFifos for each strided copy-like operation +/// user to allow global logical objectFifos to be assigned to different tile +/// locations. 
+LogicalResult duplicateGlobalObjFifos(RewriterBase &rewriter, Operation *op) { + op->walk([&](AMDAIE::DoublyStridedCopyOpInterface copyOp) { + auto source = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto target = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + if (source && source.getMemorySpaceAsUInt() == 0) { + rewriter.setInsertionPoint(copyOp); + auto newSource = rewriter.create( + rewriter.getUnknownLoc(), + cast(source.getOutput().getType()), + source.getMemref()); + rewriter.replaceUsesWithIf( + source.getOutput(), newSource.getOutput(), [&](OpOperand &use) { + return use.getOwner() == copyOp.getOperation(); + }); + } + if (target && target.getMemorySpaceAsUInt() == 0) { + rewriter.setInsertionPoint(copyOp); + auto newTarget = rewriter.create( + rewriter.getUnknownLoc(), + cast(target.getOutput().getType()), + target.getMemref()); + rewriter.replaceUsesWithIf( + target.getOutput(), newTarget.getOutput(), [&](OpOperand &use) { + return use.getOwner() == copyOp.getOperation(); + }); + } + }); + return success(); +} + +/// Assign tiles to the logical objectfifos with local memory space (L1). +/// The tiles are derived from the usage of the logical objectfifos within +/// core operations, which are already assigned a tile location. +LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op) { + WalkResult res = + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + llvm::SmallSetVector, 16> tileLocations; + if (failed(findUsersInCoreAndAddTiles( + logicalObjectFifo, logicalObjectFifo, tileLocations))) { + return WalkResult::interrupt(); + } + // Handle subviews. 
+ for (Operation *userOp : + logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { + if (auto subviewOp = dyn_cast(userOp)) { + if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, + tileLocations))) { + return WalkResult::interrupt(); + } + } + } + + SmallVector tiles; + tiles.reserve(tileLocations.size()); + rewriter.setInsertionPoint(logicalObjectFifo); + for (auto [column, row] : tileLocations) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), row); + auto tileOp = rewriter.create( + rewriter.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // Sort for deterministic output IR. + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tiles); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +/// Assign a set of candidate physical AIE tiles to logical objectFifos. This +/// rewrite takes an iterative approach by matching logical objectfifos and only +/// assigning tiles when linked through dma ops with other logical objectfifos +/// which already have tiles assigned. If the linked logical objectfifos don't +/// have tiles assigned yet, we will return a failure and give the linked +/// logical objectfifos a chance to assign tiles before returning to this one. 
+class FillTiles + : public OpRewritePattern { + using OpRewritePattern< + AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; + + public: + FillTiles(MLIRContext *context, const AMDAIE::AMDAIEDeviceModel &deviceModel) + : OpRewritePattern(context), deviceModel(deviceModel) {} + + LogicalResult matchAndRewrite( + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + PatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "FillTiles: " << logicalObjectFifo << "\n"); + if (!logicalObjectFifo.getTiles().empty()) { + return rewriter.notifyMatchFailure(logicalObjectFifo, + "Tiles are already assigned."); + } + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + if (memSpace != 0 && memSpace != 1) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Skip logical objFifos that don't operate on L3 or L2"); + } + + SmallVector targetTiles; + SmallVector sourceTiles; + LogicalResult dstRes = + getUserTiles(logicalObjectFifo, targetTiles); + LogicalResult srcRes = + getUserTiles(logicalObjectFifo, sourceTiles); + if (failed(dstRes) && failed(srcRes)) { + return rewriter.notifyMatchFailure(logicalObjectFifo, + "No source or target tiles found"); + } + + SmallVector memSpaceRows = deviceModel.getMemSpaceRows(memSpace); + if (memSpaceRows.size() == 0) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "No rows found for the memory space of this logical objFifo"); + } + if (memSpaceRows.size() > 1) { + logicalObjectFifo.emitWarning() + << "has a memory space with multiple available rows, the first one " + "of which is chosen for tile assignment, but this might not lead " + "to good usage of the available resources."; + } + uint32_t row = memSpaceRows[0]; + llvm::SmallSetVector, 16> tileLocations; + auto createTileLocations = + [&](SmallVector &tiles) -> LogicalResult { + // For deterministic and canonical output, sort on column index and erase + // duplicates. 
+ std::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + for (AMDAIE::TileOp tile : tiles) { + std::optional column = getConstantIntValue(tile.getCol()); + if (!column) return tile.emitOpError() << "found non-constant column"; + tileLocations.insert(std::make_pair(column.value(), row)); + } + return success(); + }; + + if (!targetTiles.empty() && !sourceTiles.empty()) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Found logical objectfifo with both source and target tiles, which " + "is not supported yet"); + } else if (!targetTiles.empty()) { + // Create tile locations for this logical objectfifo based on the + // consumers' tiles. + if (failed(createTileLocations(targetTiles))) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Could not find tile locations based on the consumers' tiles."); + } + } else if (!sourceTiles.empty()) { + // Create tile locations for this logical objectfifo based on producers' + // tiles. + if (failed(createTileLocations(sourceTiles))) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Could not find tile locations based on the producers' tiles."); + } + } else { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Don't assign this logicalObjectFifo to a physical tile (yet!). Wait " + "for other logical objectfifos to be assigned first."); + } + + if (tileLocations.empty()) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "No tile locations found for this logical objFifo. Maybe in a next " + "iteration, with more information, a tile location can be found."); + } + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, logicalObjectFifo.getMemref(), + tileLocations.takeVector()); + return success(); + } + + private: + // The device model used to retrieve device specific information. 
+ const AMDAIEDeviceModel &deviceModel; +}; + +/// Assign tile locations to objectFifos. Start by searching for a set of +/// candidate tile locations and then assign tiles based on a simple usage-based +/// model that prioritizes tiles that have the least usage. +LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, + const AMDAIEDeviceModel &deviceModel) { + MLIRContext *context = rewriter.getContext(); + if (failed(clearNonLocalTiles(rewriter, op))) + return op->emitOpError() << "failed to clear non-local tile assignments"; + + // Find and fill the tile candidates. + RewritePatternSet fillTilePatterns(context); + fillTilePatterns.insert(context, deviceModel); + if (failed(applyPatternsAndFoldGreedily(op, std::move(fillTilePatterns)))) { + return op->emitOpError() + << "collection of tile candidates for logical objectFifos failed"; + } + if (failed(verify(op, true))) { + return failure(); + } + LLVM_DEBUG(llvm::dbgs() << "After fillTiles: \n" << *op << "\n"); + + // Keep track of the buffer usage on tiles to try distributing buffers equally + // over available tiles. + DenseMap tileLocToUsage; + auto tileLocAndUsageCmp = [&](AMDAIE::TileOp a, AMDAIE::TileOp b) -> bool { + int64_t colA = getConstantIndexOrAssert(a.getCol()); + int64_t rowA = getConstantIndexOrAssert(a.getRow()); + int64_t colB = getConstantIndexOrAssert(b.getCol()); + int64_t rowB = getConstantIndexOrAssert(b.getRow()); + size_t usageA = tileLocToUsage[TileLoc(colA, rowA)]; + size_t usageB = tileLocToUsage[TileLoc(colB, rowB)]; + if (usageA < usageB) return true; + if (usageA > usageB) return false; + if (colA < colB) return true; + if (colA > colB) return false; + if (rowA < rowB) return true; + if (rowA > rowB) return false; + assert(false && "same tiles should never be compared"); + }; + + // After filling tile candidates, find and assign a specific one.
+ DenseMap logicalObjFifoToTileId; + WalkResult res = + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + if (memSpace != 0 && memSpace != 1) return WalkResult::advance(); + if (logicalObjectFifo.getTiles().size() == 0) { + logicalObjectFifo.emitOpError() + << "should have at least one tile candidate"; + return WalkResult::interrupt(); + } + + SmallVector tiles = + llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { + return dyn_cast_if_present(tile.getDefiningOp()); + }); + AMDAIE::TileOp assignedTileOp = + *std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp); + + // Increase usage of the chosen tile. + int64_t col = getConstantIndexOrAssert(assignedTileOp.getCol()); + int64_t row = getConstantIndexOrAssert(assignedTileOp.getRow()); + tileLocToUsage[TileLoc(col, row)] += 1; + + rewriter.setInsertionPoint(logicalObjectFifo); + SmallVector tileResults = { + cast(assignedTileOp.getResult())}; + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +namespace { + +class AMDAIEAssignTilesPass + : public impl::AMDAIEAssignTilesBase { + public: + void getDependentDialects(DialectRegistry &registry) const override { + registry.insert(); + } + + void runOnOperation() override; +}; + +void AMDAIEAssignTilesPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(&getContext()); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration.
This " + "device-specific information is required to determine when loops " + "can be subsumed into DMA operations, and must be attached to a " + "containing ModuleOp."; + return signalPassFailure(); + } + AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); + + // Assign tile locations to logical objectfifos on local (L1) memory. + if (failed(assignLocalTiles(rewriter, parentOp))) { + parentOp->emitOpError() << "local tile assignment failed"; + return signalPassFailure(); + } + if (failed(verify(parentOp, true))) { + return signalPassFailure(); + } + LLVM_DEBUG(llvm::dbgs() << "After assignLocalTiles: \n" << *parentOp << "\n"); + + // Duplicate global objFifos for each strided copy-like operation user to + // allow global logical objectFifos to be assigned to different tile + // locations. + if (failed(duplicateGlobalObjFifos(rewriter, parentOp))) { + parentOp->emitOpError() << "failed duplicating global object fifos"; + return signalPassFailure(); + } + if (failed(verify(parentOp, true))) { + return signalPassFailure(); + } + LLVM_DEBUG(llvm::dbgs() << "After duplicateGlobalObjFifos: \n" + << *parentOp << "\n"); + + // Assign tile locations to logical objectfifos on non-local (not L1) memory. 
+ if (failed(assignNonLocalTiles(rewriter, parentOp, deviceModel))) { + parentOp->emitOpError() << "non-local tile assignment failed"; + return signalPassFailure(); + } + if (failed(verify(parentOp, true))) { + return signalPassFailure(); + } + LLVM_DEBUG(llvm::dbgs() << "After assignNonLocalTiles: \n" + << *parentOp << "\n"); +} + +} // namespace + +std::unique_ptr createAMDAIEAssignTilesPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index dbd439458..750fcca7b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" #include "llvm/Support/Debug.h" @@ -323,37 +324,6 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { } }; -/// Return the tiles of the sources respectively targets of the users of this -/// logical objectfifo, depending on whether the OperateOn template parameter is -/// set to `OperateOn::Source` respectively `OperateOn::Target`.
-template -LogicalResult getUserTiles( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - SmallVectorImpl &tiles) { - llvm::SmallSetVector tileSet; - for (Operation *user : logicalObjectFifo->getUsers()) { - if (auto dmaOp = dyn_cast(user)) { - ValueRange tileIndices; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getSourceObjectFifo().getTiles(); - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getTargetObjectFifo().getTiles(); - } - - // Only fill in tiles when all sources have tiles. - if (tileIndices.empty()) return failure(); - for (Value index : tileIndices) { - tileSet.insert( - dyn_cast_if_present(index.getDefiningOp())); - } - } - } - tiles = tileSet.takeVector(); - return success(); -} - /// Insert `amdaie.logicalobjectfifo.access` operations which retrieve the /// memrefs from logical objectfifos and update the computational operations to /// operate on these local memrefs. @@ -454,229 +424,6 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { return success(); } -/// Utility to recursively find users of the provided logical objectFifo inside -/// `amdaie.core` operations and return the tile coordinates. 
-LogicalResult findUsersInCoreAndAddTiles( - Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - llvm::SmallSetVector, 16> &tiles) { - for (Operation *userOp : op->getUsers()) { - if (auto coreOp = userOp->getParentOfType()) { - AMDAIE::TileOp tileOp = coreOp.getTileOp(); - std::optional column = getConstantIntValue(tileOp.getCol()); - std::optional row = getConstantIntValue(tileOp.getRow()); - if (!column || !row) { - return coreOp.emitOpError() << "has non-constant tile location"; - } - tiles.insert(std::make_pair(column.value(), row.value())); - } - if (auto subviewOp = dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); - } else if (auto userLogicalObjectFifo = - dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(userLogicalObjectFifo, - logicalObjectFifo, tiles); - } - } - return success(); -} - -/// Assign tiles to the logical objectfifos with local memory space (L1). -/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. -LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - WalkResult res = moduleOp->walk( - [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (!memSpace || dyn_cast(memSpace).getInt() != 2) - return WalkResult::advance(); - - llvm::SmallSetVector, 16> tileLocations; - if (failed(findUsersInCoreAndAddTiles( - logicalObjectFifo, logicalObjectFifo, tileLocations))) { - return WalkResult::interrupt(); - } - // Handle subviews. 
- for (Operation *userOp : - logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { - if (auto subviewOp = dyn_cast(userOp)) { - if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, - tileLocations))) { - return WalkResult::interrupt(); - } - } - } - - SmallVector tiles; - tiles.reserve(tileLocations.size()); - rewriter.setInsertionPoint(logicalObjectFifo); - for (auto [column, row] : tileLocations) { - auto colIndex = rewriter.create( - rewriter.getUnknownLoc(), column); - auto rowIndex = rewriter.create( - rewriter.getUnknownLoc(), row); - auto tileOp = rewriter.create( - rewriter.getUnknownLoc(), colIndex, rowIndex); - tiles.push_back(tileOp.getResult()); - } - // Sort for deterministic output IR. - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileValueColumnAndRowComparator); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast( - logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles); - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - -/// Assign a set of potential physical AIE tiles to logical objectFifos. This -/// rewrite takes an iterative approach by matching logical objectfifos and only -/// assigning tiles when linked through dma ops with other logical objectfifos -/// which already have tiles assigned. If the linked logical objectfifos don't -/// have tiles assigned yet, we will return a failure and give the linked -/// logical objectfifos a chance to assign tiles before returning to this one. -/// -/// TODO(jornt): There are decisions being made in this pass on which tiles to -/// assign to a logical objectfifo. This logic is very simple for now and tries -/// to use the tiles in the same columns as targets and sources. At some point, -/// we probably need some AIE device model to guide the assignement here for -/// performance and to avoid hardware resource issues later on. 
-class FillAieTiles - : public OpRewritePattern { - using OpRewritePattern< - AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; - - LogicalResult matchAndRewrite( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - PatternRewriter &rewriter) const override { - LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n"); - if (!logicalObjectFifo.getTiles().empty()) { - return failure(); - } - - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - // Skip logical objectfifos within local memory as they should already be - // assigned. - if (memSpace && dyn_cast(memSpace).getInt() == 2) { - if (logicalObjectFifo.getTiles().empty()) { - logicalObjectFifo.emitOpError() - << "found logical objectfifo on local memory space with no tiles " - "assigned."; - } - return failure(); - } - // HandLe both L3/shim and L2/Memtiles. - // Skip logical objectfifos within non-global and non-shared memory. - if (memSpace && dyn_cast(memSpace).getInt() != 1) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with unknown memory space"; - } - - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - - // If no source and target tiles found, skip. - if (failed(dstRes) && failed(srcRes)) { - return failure(); - } - - // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie - // target model for this. - int64_t rowInt = memSpace ? 1 : 0; - llvm::SmallSetVector, 16> tileLocations; - auto createTileLocations = - [&](SmallVector &tiles) -> LogicalResult { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign - // tiles based on a resource model. - std::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - // Erase duplicates. 
- tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); - for (AMDAIE::TileOp tile : tiles) { - std::optional column = getConstantIntValue(tile.getCol()); - if (!column) return tile.emitOpError() << "found non-constant column"; - tileLocations.insert(std::make_pair(column.value(), rowInt)); - } - return success(); - }; - - if (!targetTiles.empty() && !sourceTiles.empty()) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with both source and target tiles, " - "which is not supported yet"; - } else if (!targetTiles.empty()) { - // Create tile locations for this logical objectfifo based on target - // tiles. - if (failed(createTileLocations(targetTiles))) { - return failure(); - } - } else if (!sourceTiles.empty()) { - // Create tile locations for this logical objectfifo based on source - // tiles. - if (failed(createTileLocations(sourceTiles))) { - return failure(); - } - } else { - // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait - // for other logical objectfifos to be assigned first. - return failure(); - } - - // If no tile results, skip, and maybe in a next iteration another tile will - // be found. - if (tileLocations.empty()) { - return failure(); - } - - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, logicalObjectFifo.getMemref(), - tileLocations.takeVector()); - return success(); - } -}; - -/// Assign specific tile locations to objectFifos, starting from the set of -/// potential tile locations filled in earlier. 
-LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (memSpace && dyn_cast(memSpace).getInt() != 1) - return WalkResult::advance(); - - SmallVector tiles = - llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { - return dyn_cast_if_present(tile.getDefiningOp()); - }); - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - - // For now, use first tile in sorted list. - // TODO(jornt): This will need to become more complex in the future to - // account for potential hardware limitations and constraints. - SmallVector tileResults = {cast(tiles[0].getResult())}; - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tileResults); - return WalkResult::advance(); - }); - return success(); -} - class AMDAIEDistributeCoresAndObjectFifosPass : public impl::AMDAIEDistributeCoresAndObjectFifosBase< AMDAIEDistributeCoresAndObjectFifosPass> { @@ -694,6 +441,18 @@ class AMDAIEDistributeCoresAndObjectFifosPass void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { MLIRContext *context = &getContext(); ModuleOp moduleOp = getOperation(); + IRRewriter rewriter(moduleOp.getContext()); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(moduleOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + moduleOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. 
This " + "device-specific information is required to assign tiles to " + "logical objectFifos, and must be attached to a containing " + "ModuleOp."; + return signalPassFailure(); + } + AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); // Convert local scf.forall operations selected for parallel distribution to // nested scf.for operations. @@ -750,7 +509,7 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { << moduleOp << "\n"); // Assign tile locations to logical objectfifos on local (L1) memory. - if (failed(assignLocalAieTiles(moduleOp))) { + if (failed(assignLocalTiles(rewriter, moduleOp))) { moduleOp.emitOpError() << "local tile assignment failed"; return signalPassFailure(); } @@ -759,40 +518,21 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n" + LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalTiles: \n" << moduleOp << "\n"); - // Assign a set of potential tile locations to the remaining logical - // objectFifos. - RewritePatternSet assignAieTilePatters(context); - assignAieTilePatters.insert(context); - if (failed(applyPatternsAndFoldGreedily(moduleOp, - std::move(assignAieTilePatters)))) { - moduleOp.emitOpError() - << "collection of tile candidates for logical objectFifos failed"; + // Assign tile locations to logical objectfifos on non-local (not L1) memory. + if (failed(assignNonLocalTiles(rewriter, moduleOp, deviceModel))) { + moduleOp.emitOpError() << "non-local tile assignment failed"; return signalPassFailure(); } if (failed(verify(moduleOp, true))) { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n" - << moduleOp << "\n"); - - // Assign specific tile locations to objectFifos, starting from the set of - // potential tile locations filled in earlier.
- if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) { - moduleOp.emitOpError() - << "tile assignment and logical objectFifo distribution failed"; - return signalPassFailure(); - } - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() - << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n" - << moduleOp << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Module after assignNonLocalTiles: \n" + << moduleOp << "\n"); } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index ae96b35ca..1f4d2d6de 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -52,6 +52,7 @@ iree_cc_library( "AMDAIEAssignLogicalObjectFifoDepth.cpp" "AMDAIEAssignNpuDmaBdIds.cpp" "AMDAIEAssignPacketIds.cpp" + "AMDAIEAssignTiles.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 45a770a62..468d61577 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -27,6 +27,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH #define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS #define GEN_PASS_DEF_AMDAIEASSIGNPACKETIDS +#define GEN_PASS_DEF_AMDAIEASSIGNTILES #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 
0edd735d4..634571caf 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -605,6 +605,10 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIEAssignTilesPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIEDmaToCircularDmaPass()); passManager.addNestedPass(createAMDAIECreateAIEWorkgroupPass()); passManager.addPass(createCSEPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 399d0ac19..52242bbb8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -93,6 +93,9 @@ std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass(); /// Create a pass to assign packet ids to `amdaie.flow` operations. std::unique_ptr createAMDAIEAssignPacketIdsPass(); +/// Create a pass to assign physical tile locations to logical objFifos. +std::unique_ptr createAMDAIEAssignTilesPass(); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. 
std::unique_ptr createAMDAIEBridgeToAIRPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 6666c8d6e..8fcdc767c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -67,6 +67,11 @@ def AMDAIEAssignPacketIds : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignPacketIdsPass()"; } +def AMDAIEAssignTiles : Pass<"iree-amdaie-assign-tiles", ""> { + let summary = "Assign physical tile locations to logical objFifos."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignTilesPass()"; +} + def AMDAIEBridgeToAIR : Pass<"iree-amdaie-bridge-to-air", ""> { let summary = "Perform transformations that allow hooking into AIR/AIE lowering"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEBridgeToAIRPass()"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h index 10d444584..ca56f4446 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h @@ -14,6 +14,15 @@ namespace mlir::iree_compiler::AMDAIE { +/// Assign tile locations to the logical objectfifos with local memory space +/// (L1). +LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op); + +/// Assign tile locations to the logical objectfifos with non-local memory space +/// (L2, L3 etc, not L1). +LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, + const AMDAIEDeviceModel &deviceModel); + /// Unroll the loops within the control code regions. 
LogicalResult controlCodeLoopUnroll(RewriterBase &rewriter, AMDAIE::ControlCodeOp controlCodeOp); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index c0d2bf00f..151812b33 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -16,6 +16,7 @@ iree_lit_test_suite( "assign_logical_objectfifo_depth.mlir" "assign_npu_dma_bd_ids.mlir" "assign_packet_ids.mlir" + "assign_tiles.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_doubly_strided_op.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir new file mode 100644 index 000000000..ca6ee9b93 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir @@ -0,0 +1,360 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-assign-tiles,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// Test assignment of L1 objFifos based on the cores where they are used. 
+// CHECK-LABEL: @assign_local_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_local_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %alloc = memref.alloc() : memref<1024xi32, 2> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.core(%tile_0_2, in : [], out : []) { + %3 = amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %5 = amdaie.core(%tile_0_3, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : 
memref<1024xi32, 2> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L2 objFifos based on L1 assignments. +// CHECK-LABEL: @assign_l2_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l2_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %alloc = memref.alloc() : memref<2048xi32, 1> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_0_2, in : [], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %5 = 
amdaie.core(%tile_0_3, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32, 1> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L2 objFifos onto different columns. +// CHECK-LABEL: @assign_l2_tiles_on_diff_cols +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l2_tiles_on_diff_cols() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1024xi32, 1> + %alloc_1 = memref.alloc() : memref<2048xi32, 1> + %alloc_2 = memref.alloc() : memref<1024xi32, 2> + %alloc_3 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = 
amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.dma_cpy_nd(%3[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.core(%tile_0_2, in : [], out : []) { + %7 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + amdaie.end + } + %8 = amdaie.core(%tile_1_2, in : [], out : []) { + %9 = amdaie.logicalobjectfifo.access(%3, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32, 1> + memref.dealloc %alloc_1 : memref<2048xi32, 1> + memref.dealloc %alloc_2 : memref<1024xi32, 2> + memref.dealloc %alloc_3 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 and L2 objFifos based on L1 assignments. +// CHECK-LABEL: @assign_l3_l2_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = 
"none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_l2_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 1> + %alloc_2 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.core(%tile_0_2, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 1> + memref.dealloc %alloc_2 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l3_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_0_2, in : [], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 objFifos onto different columns. 
+// CHECK-LABEL: @assign_l3_tiles_on_diff_cols +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_0]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_tiles_on_diff_cols() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1024xi32> + %alloc_1 = memref.alloc() : memref<2048xi32> + %alloc_2 = memref.alloc() : memref<1024xi32, 1> + %alloc_3 = memref.alloc() : memref<2048xi32, 1> + %alloc_4 = memref.alloc() : memref<1024xi32, 2> + %alloc_5 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2048xi32, 2> -> 
!amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%4[] [] [], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%5[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.core(%tile_0_2, in : [], out : []) { + %11 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + amdaie.end + } + %12 = amdaie.core(%tile_1_2, in : [], out : []) { + %13 = amdaie.logicalobjectfifo.access(%5, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32> + memref.dealloc %alloc_1 : memref<2048xi32> + memref.dealloc %alloc_2 : memref<1024xi32, 1> + memref.dealloc %alloc_3 : memref<2048xi32, 1> + memref.dealloc %alloc_4 : memref<1024xi32, 2> + memref.dealloc %alloc_5 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test duplicate global logical objFifos. 
+// CHECK-LABEL: @duplicate_global_object_fifos +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @duplicate_global_object_fifos() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + %alloc_2 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%1[] [] [], 
%0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%2[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.core(%tile_0_2, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %7 = amdaie.core(%tile_1_2, in : [], out : []) { + %8 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + memref.dealloc %alloc_2 : memref<2048xi32, 2> + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 3a9e583bc..57a15c673 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,5 +1,19 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + // Check for unrolling an amdaie.core within a parallel loop with a single // induction variable with multiple iterations. There are no dma ops in this // check. 
@@ -18,7 +32,8 @@ // CHECK: %{{.*}} = amdaie.core(%[[TILE_2]], in : [], out : []) // CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) // CHECK: %{{.*}} = amdaie.core(%[[TILE_3]], in : [], out : []) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_1x4() { %c2 = arith.constant 2 : index scf.forall (%arg0, %arg1) in (1, 1) { @@ -50,7 +65,8 @@ module { // CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]], in : [], out : []) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) // CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]], in : [], out : []) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_2x2() { scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (2, 2) { @@ -92,7 +108,8 @@ module { // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : []) // CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @unroll_dma() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -142,7 +159,8 @@ module { // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill 
ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_single_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -196,7 +214,8 @@ module { // CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_and_affine_single_loop_2x1() { %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<32x1024xi32, 1> @@ -251,7 +270,8 @@ module { // CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @unroll_dma_and_affine_single_loop() { %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<32x1024xi32, 1> @@ -308,7 +328,8 @@ module { // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = 
"npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_multi_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -367,7 +388,8 @@ module { // CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_one_of_multi_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -440,7 +462,8 @@ module { // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_3]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -491,6 +514,8 @@ module { // CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} // CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], 
{%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_1_3]]} @@ -499,8 +524,8 @@ module { // CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_6:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_3]]} // CHECK-DAG: %[[FROM_MEMREF_7:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_3]]} -// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_1_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]] // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]] // CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]] @@ -529,7 +554,8 @@ module { // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @nested_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 %c1 = arith.constant 1 : index @@ -589,23 +615,26 @@ module { // CHECK: linalg.fill ins(%[[C0]] : i32) outs(%[[ACCESS]] : memref<1x1x8x8x4x4xi32, 2 : i32>) // 
CHECK: amdaie.end // CHECK: memref.dealloc %[[ALLOC]] : -func.func @l1_temporary_buffer_for_matmul_elem() { - %c0_i32 = arith.constant 0 : i32 - %c2 = arith.constant 2 : index - %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> - scf.forall (%arg0, %arg1) in (1, 1) { - scf.forall (%arg2, %arg3) in (1, 1) { - %subview = memref.subview %alloc_6[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> - %26 = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %26) - %27 = amdaie.core(%tile, in : [], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> - return +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @l1_temporary_buffer_for_matmul_elem() { + %c0_i32 = arith.constant 0 : i32 + %c2 = arith.constant 2 : index + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + scf.forall (%arg0, %arg1) in (1, 1) { + scf.forall (%arg2, %arg3) in (1, 1) { + %subview = memref.subview %alloc_6[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %26) + %27 = amdaie.core(%tile, in : [], out : []) { + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } } // ----- @@ -618,19 +647,22 @@ func.func @l1_temporary_buffer_for_matmul_elem() { // CHECK-SAME: to 
memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> // CHECK-NOT: memref.subview // CHECK: return -func.func @not_distributable() { - %cst = arith.constant 0.000000e+00 : bf16 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<2x2x100xbf16, 2> - scf.forall (%arg0, %arg1) in (2, 2) { - scf.for %arg2 = %c0 to %c4 step %c1 { - %subview = memref.subview %alloc[%arg0, %arg1, %arg2] [1, 1, 10] [1, 1, 1] : memref<2x2x100xbf16, 2> to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> - linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2>) - } - } {mapping = [#gpu.thread, #gpu.thread]} - return +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @not_distributable() { + %cst = arith.constant 0.000000e+00 : bf16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %alloc = memref.alloc() : memref<2x2x100xbf16, 2> + scf.forall (%arg0, %arg1) in (2, 2) { + scf.for %arg2 = %c0 to %c4 step %c1 { + %subview = memref.subview %alloc[%arg0, %arg1, %arg2] [1, 1, 10] [1, 1, 1] : memref<2x2x100xbf16, 2> to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> + linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2>) + } + } {mapping = [#gpu.thread, #gpu.thread]} + return + } } @@ -652,6 +684,7 @@ func.func @not_distributable() { // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%c0, %c1) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%c1, %c1) // CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%c0, %c0) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%c1, %c0) // CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} // CHECK-DAG: 
%[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_1]]} // CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_1]]} @@ -663,7 +696,7 @@ func.func @not_distributable() { // CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_0_2]]} // CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_0_0]]} -// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_1_0]]} // CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_0_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] // CHECK-SAME: %[[FROM_MEMREF_11]] @@ -702,7 +735,8 @@ func.func @not_distributable() { #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> #map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos() { %c2 = arith.constant 2 : index %c1024 = arith.constant 1024 : index @@ -818,7 +852,8 @@ module { // CHECK-DAG: vector.transfer_write %[[CONTRACT]], %[[VAL_2]] // CHECK-DAG-SAME: [%[[C0]], %[[C0]], %[[ARG3]], %[[ARG2]], %[[C0]], %[[C0]]] // CHECK-DAG-SAME: in_bounds = [true, true, true, true, true, true] -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes 
{hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_vectorization() { %c192 = arith.constant 192 : index %c32 = arith.constant 32 : index @@ -918,7 +953,8 @@ module { // CHECK-DAG: func.call @matmul_i32_i32 // CHECK-DAG: amdaie.end // CHECK-DAG: } {elf_file = "/path/to/ukernel.o"} -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func private @matmul_i32_i32(memref, index, memref, index, memref, index) attributes {link_with = "/path/to/ukernels.o", llvm.bareptr = true} func.func @distribute_cores_and_objectfifos_ukernel() { %c64 = arith.constant 64 : index @@ -987,7 +1023,8 @@ module { // CHECK-SAME: ins(%[[ACCESS_1]] : memref<4x4xi32, 2 : i32>) outs(%[[SUBVIEW:.*]] : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) { #map = affine_map<(d0, d1) -> (d0, d1)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @mixed_alloc_subview_operands() { %c2 = arith.constant 2 : index %c0_i32 = arith.constant 0 : i32 diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index 0af56fd20..a29fe0898 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -318,6 +318,22 @@ uint32_t AMDAIEDeviceModel::getMemTileSize(uint8_t col, uint8_t row) const { return devInst.DevProp.DevMod[static_cast(tileType)].MemMod->Size; } +SmallVector AMDAIEDeviceModel::getMemSpaceRows( + uint8_t memSpace) const { + SmallVector res; + if (memSpace == 0) { + res.resize(deviceConfig.shimTileNumRows); + std::iota(res.begin(), 
res.end(), configPtr.ShimRowNum); + } else if (memSpace == 1) { + res.resize(configPtr.MemTileNumRows); + std::iota(res.begin(), res.end(), configPtr.MemTileRowStart); + } else if (memSpace == 2) { + res.resize(configPtr.AieTileNumRows); + std::iota(res.begin(), res.end(), configPtr.AieTileRowStart); + } + return res; +} + bool AMDAIEDeviceModel::hasLegalMemAffinity(uint8_t coreCol, uint8_t coreRow, uint8_t memCol, uint8_t memRow) const { @@ -483,6 +499,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE1_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE1_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE1_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE1_SS_MSEL_MAX; @@ -507,6 +524,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2302: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIEML_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; @@ -530,6 +548,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2802: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIEML_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; @@ -557,6 +576,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE2IPU_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE2IPU_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax 
= XAIE2IPU_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE2IPU_SS_MSEL_MAX; @@ -603,6 +623,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::npu4: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE_STRIXB0_MEM_TILE_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE_STRIXB0_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE_STRIXB0_SS_MSEL_MAX; diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index a45f798ad..548d18c5a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -228,6 +228,12 @@ struct AMDAIEDeviceModel { /// retrieved in another way before adding new fields to this struct. struct AMDAIEDeviceConfig { + /////////////////////////////////////// + // AIE Array configuration constants // + /////////////////////////////////////// + /// The number of shim tile rows. Not found in aie-rt data structures, but + /// provided as `XAIE_SHIM_NUM_ROWS`. + uint8_t shimTileNumRows{1}; /// Set default minimum stride bitwidth/addressing granularity to 32 bits as /// this is the value for all current architecture versions. 
uint8_t minStrideBitWidth{32}; @@ -334,6 +340,8 @@ struct AMDAIEDeviceModel { uint32_t getMemTileSize(uint8_t col, uint8_t row) const; uint32_t getCoreTileLocalMemorySize() const; + SmallVector getMemSpaceRows(uint8_t memSpace) const; + uint32_t getNumBDs(uint8_t col, uint8_t row) const; uint32_t getNumSourceSwitchBoxConnections(uint8_t col, uint8_t row, @@ -356,7 +364,9 @@ struct AMDAIEDeviceModel { return deviceConfig.vectorLoadStoreAlignmentBits; } - uint32_t getMaxVectorSizeBits() const { return deviceConfig.maxVectorSizeBits; } + uint32_t getMaxVectorSizeBits() const { + return deviceConfig.maxVectorSizeBits; + } uint32_t getShiftOperandBits() const { return deviceConfig.shiftOperandBits; } From 8c735372cf6a4c1641323c97ce5612272787764e Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Thu, 21 Nov 2024 11:58:08 -0800 Subject: [PATCH 2/2] Review --- .../Transforms/AMDAIEAssignTiles.cpp | 72 +++++++++---------- .../AMDAIEDistributeCoresAndObjectFifos.cpp | 5 +- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 3 +- .../Transforms/test/assign_tiles.mlir | 2 +- 4 files changed, 37 insertions(+), 45 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp index cc54e481b..7b4acc0b1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp @@ -85,36 +85,34 @@ LogicalResult clearNonLocalTiles(RewriterBase &rewriter, Operation *op) { return success(); } -/// Utility to duplicate global objFifos for each strided copy-like operation -/// user to allow global logical objectFifos to be assigned to different tile -/// locations. +/// Utility to duplicate global objectFifos (L3) for each strided copy-like +/// operation user to allow global logical objectFifos to be assigned to +/// different tile locations. 
LogicalResult duplicateGlobalObjFifos(RewriterBase &rewriter, Operation *op) { op->walk([&](AMDAIE::DoublyStridedCopyOpInterface copyOp) { auto source = dyn_cast_if_present( copyOp.getSource().getDefiningOp()); auto target = dyn_cast_if_present( copyOp.getTarget().getDefiningOp()); + auto createNewObjFifoAndReplaceUsesFrom = + [&](AMDAIE::LogicalObjectFifoFromMemrefOp oldObjFifo) { + rewriter.setInsertionPoint(copyOp); + auto newObjFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast(oldObjFifo.getOutput().getType()), + oldObjFifo.getMemref()); + rewriter.replaceUsesWithIf( + oldObjFifo.getOutput(), newObjFifo.getOutput(), + [&](OpOperand &use) { + return use.getOwner() == copyOp.getOperation(); + }); + }; if (source && source.getMemorySpaceAsUInt() == 0) { - rewriter.setInsertionPoint(copyOp); - auto newSource = rewriter.create( - rewriter.getUnknownLoc(), - cast(source.getOutput().getType()), - source.getMemref()); - rewriter.replaceUsesWithIf( - source.getOutput(), newSource.getOutput(), [&](OpOperand &use) { - return use.getOwner() == copyOp.getOperation(); - }); + createNewObjFifoAndReplaceUsesFrom(source); } if (target && target.getMemorySpaceAsUInt() == 0) { - rewriter.setInsertionPoint(copyOp); - auto newTarget = rewriter.create( - rewriter.getUnknownLoc(), - cast(target.getOutput().getType()), - target.getMemref()); - rewriter.replaceUsesWithIf( - target.getOutput(), newTarget.getOutput(), [&](OpOperand &use) { - return use.getOwner() == copyOp.getOperation(); - }); + createNewObjFifoAndReplaceUsesFrom(target); } }); return success(); @@ -236,7 +234,8 @@ class FillTiles tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); for (AMDAIE::TileOp tile : tiles) { std::optional column = getConstantIntValue(tile.getCol()); - if (!column) return tile.emitOpError() << "found non-constant column"; + if (!column) + return rewriter.notifyMatchFailure(tile, "found non-constant column"); tileLocations.insert(std::make_pair(column.value(), row)); } 
return success(); @@ -309,8 +308,8 @@ LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, } LLVM_DEBUG(llvm::dbgs() << "After fillTiles: \n" << *op << "\n"); - // Keep track of the buffer usage on tiles to try distributing buffers equally - // over available tiles. + // Keep track of the buffer usage on tiles to try distributing buffers evenly + // over available tile resources. DenseMap tileLocToUsage; auto tileLocAndUsageCmp = [&](AMDAIE::TileOp a, AMDAIE::TileOp b) -> bool { int64_t colA = getConstantIndexOrAssert(a.getCol()); @@ -347,7 +346,9 @@ LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, AMDAIE::TileOp assignedTileOp = *std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp); - // Increase usage of the chosen tile. + // Increase usage of the chosen tile as a new logical objectFifo will be + // assigned to it. This allows distributing the logical objectFifos + // evenly across the available tile resources. int64_t col = getConstantIndexOrAssert(assignedTileOp.getCol()); int64_t row = getConstantIndexOrAssert(assignedTileOp.getRow()); tileLocToUsage[TileLoc(col, row)] += 1; @@ -386,44 +387,35 @@ void AMDAIEAssignTilesPass::runOnOperation() { if (!maybeDevice) { parentOp->emitOpError() << "has no AMDAIEDevice in the target attribute configuration. This " - "device-specific information is required to determine when loops " - "can be subsumed into DMA operations, and must be attached to a " - "containing ModuleOp."; + "device-specific information is required for looking up column and " + "row related information, and must be attached to a containing " + "ModuleOp."; return signalPassFailure(); } AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); - // Assign tile locations to logical objectfifos on local (L1) memory. + // Assign tile locations to logical objectFifos on local (L1) memory.
if (failed(assignLocalTiles(rewriter, parentOp))) { parentOp->emitOpError() << "local tile assignment failed"; return signalPassFailure(); } - if (failed(verify(parentOp, true))) { - return signalPassFailure(); - } LLVM_DEBUG(llvm::dbgs() << "After assignLocalTiles: \n" << *parentOp << "\n"); - // Duplicate global objFifos for each strided copy-like operation user to + // Duplicate global objectFifos for each strided copy-like operation user to // allow global logical objectFifos to be assigned to different tile // locations. if (failed(duplicateGlobalObjFifos(rewriter, parentOp))) { parentOp->emitOpError() << "failed duplicating global object fifos"; return signalPassFailure(); } - if (failed(verify(parentOp, true))) { - return signalPassFailure(); - } LLVM_DEBUG(llvm::dbgs() << "After duplicateGlobalObjFifos: \n" << *parentOp << "\n"); - // Assign tile locations to logical objectfifos on non-local (not L1) memory. + // Assign tile locations to logical objectFifos on non-local (not L1) memory. if (failed(assignNonLocalTiles(rewriter, parentOp, deviceModel))) { - parentOp->emitOpError() << "local tile assignment failed"; + parentOp->emitOpError() << "non-local tile assignment failed"; return signalPassFailure(); } - if (failed(verify(parentOp, true))) { - return signalPassFailure(); - } LLVM_DEBUG(llvm::dbgs() << "After assignNonLocalTiles: \n" << *parentOp << "\n"); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 750fcca7b..26a269935 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -447,9 +447,8 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { if (!maybeDevice) { moduleOp->emitOpError() << "has no AMDAIEDevice in the target attribute configuration.
This " - "device-specific information is required to determine when loops " - "can be subsumed into DMA operations, and must be attached to a " - "containing ModuleOp."; + "device-specific information is required for tile assignment " + "purposes, and must be attached to a containing ModuleOp."; return signalPassFailure(); } AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 8fcdc767c..26ec9bc46 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -68,7 +68,8 @@ def AMDAIEAssignPacketIds : } def AMDAIEAssignTiles : Pass<"iree-amdaie-assign-tiles", ""> { - let summary = "Assign physical tile locations to logical objFifos."; + let summary = "Assign physical tile locations to logical objectFifos. " + "Existing assignments will be ignored/replaced."; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignTilesPass()"; } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir index ca6ee9b93..10bd42978 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir @@ -306,7 +306,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// Test duplicate global logical objFifos. +// Test duplicate global logical objectFifos (L3). // CHECK-LABEL: @duplicate_global_object_fifos // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index