diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index 0d2e6c5821..6e9af55a07 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -1061,7 +1061,7 @@ struct AIEObjectFifoStatefulTransformPass builder.getUnknownLoc(), globalNextIndex, ValueRange(ArrayRef({index.getResult()}))); Value val = builder.create( - oldCounter.getLoc(), builder.getIndexAttr(relOp.getSize())); + oldCounter.getLoc(), builder.getI32IntegerAttr(relOp.getSize())); Value sum = builder.create(val.getLoc(), oldCounter, val); Value newCounter = builder.create(sum.getLoc(), sum, size); builder.create(size.getLoc(), newCounter, globalNextIndex, @@ -1091,7 +1091,7 @@ struct AIEObjectFifoStatefulTransformPass builder.setInsertionPoint(coreOp); auto memrefTy = MemRefType::get(SmallVector{(int64_t)fifoSizes.size()}, - builder.getIndexType()); + builder.getI32Type()); auto globalNextIndex = builder.create( builder.getUnknownLoc(), memrefTy, coreOp.getTile(), /*sym_name*/ nullptr, /*address*/ nullptr, @@ -1109,14 +1109,14 @@ struct AIEObjectFifoStatefulTransformPass int index = 0; builder.setInsertionPointToStart(&(coreOp.getBody().front())); Value initVal = builder.create( - builder.getUnknownLoc(), builder.getIndexAttr(0)); + builder.getUnknownLoc(), builder.getI32IntegerAttr(0)); for (auto i : fifoSizes) { auto indexOp = builder.create( initVal.getLoc(), builder.getIndexAttr(index)); globalIndices[i.first] = indexOp; index++; auto size = builder.create( - indexOp.getLoc(), builder.getIndexAttr(i.second)); + indexOp.getLoc(), builder.getI32IntegerAttr(i.second)); constantSizes[i.first] = size; builder.create( size.getLoc(), initVal, globalNextIndex, @@ -1153,10 +1153,13 @@ struct AIEObjectFifoStatefulTransformPass // Create a switch for each subview access builder.setInsertionPointAfter(accessOp); - auto switchIndex = builder.create( + auto switchIndexAsInteger = builder.create( builder.getUnknownLoc(), globalNextIndex, ValueRange( ArrayRef({globalIndices[{createOp, port}].getResult()}))); + auto switchIndex = builder.create( + builder.getUnknownLoc(), builder.getIndexType(), + switchIndexAsInteger); unsigned caseRegionCounts = fifoSizes[{createOp, port}]; SmallVector caseValues; for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) { diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py index 8c41a9868e..7c2b664a65 100644 --- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py +++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py @@ -9,10 +9,11 @@ # # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o # RUN: %python %S/aie2.py > ./aie2.mlir -# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags -# RUN: %run_on_npu ./test.exe | FileCheck %s +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s # CHECK: PASS! + import numpy as np from aie.dialects.aie import * diff --git a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py index 03a25b90db..19dc7c6e3c 100644 --- a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py +++ b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py @@ -9,7 +9,7 @@ # # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o # RUN: %python %S/aie2.py > ./aie2.mlir -# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags # RUN: %run_on_npu ./test.exe | FileCheck %s # CHECK: PASS! diff --git a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py index 3f04ed0f1f..4814d27dae 100644 --- a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py +++ b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py @@ -9,7 +9,7 @@ # # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o # RUN: %python %S/aie2.py > ./aie2.mlir -# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags # RUN: %run_on_npu ./test.exe | FileCheck %s # CHECK: PASS! diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py index 8b91d2e434..129b69eae5 100644 --- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py +++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py @@ -9,10 +9,11 @@ # # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o # RUN: %python %S/aie2.py > ./aie2.mlir -# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags # RUN: %run_on_npu ./test.exe | FileCheck %s -# XFAIL: * +# CHECK: PASS! + from aie.dialects.aie import * from aie.dialects.aiex import * from aie.helpers.dialects.ext.scf import _for as range_ diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp index 648924ac4f..0fb9cfa7d4 100644 --- a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp +++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp @@ -28,6 +28,7 @@ #define INPUT_SIZE (100 * sizeof(int)) // in bytes #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes #define WIDTH_SIZE (10 * sizeof(int)) // in bytes + #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir new file mode 100644 index 0000000000..87197925b1 --- /dev/null +++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir @@ -0,0 +1,201 @@ +module { + aie.device(npu1_1col) { + memref.global "public" @output_fifo_cons : memref<10xi32> + memref.global "public" @output_fifo : memref<10xi32> + memref.global "public" @input_fifo_cons : memref<10xi32> + memref.global "public" @input_fifo : memref<10xi32> + func.func private @add_10_i32(memref<10xi32>, memref<10xi32>, memref<10xi32>) + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 0 : i32, sym_name = "output_fifo_cons_prod_lock"} + %output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"} + %output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32> + %output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32> + %output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"} + %output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"} + %input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32> + %input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32> + %input_fifo_cons_buff_2 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_2"} : memref<10xi32> + %input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 3 : i32, sym_name = "input_fifo_cons_prod_lock"} + %input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"} + %input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 0 : i32, sym_name = "input_fifo_prod_lock"} + %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"} + aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) + %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c0_0 = arith.constant 0 : index + %c2_i32 = arith.constant 2 : i32 + memref.store %c0_i32, %buffer_0_2[%c0_0] : memref<2xi32> + %c1 = arith.constant 1 : index + %c3_i32 = arith.constant 3 : i32 + memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32> + %c0_1 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1_2 = arith.constant 1 : index + scf.for %arg0 = %c0_1 to %c10 step %c1_2 { + aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) + %0 = memref.load %buffer_0_2[%c0_0] : memref<2xi32> + %1 = arith.index_cast %0 : i32 to index + %2 = scf.index_switch %1 -> memref<10xi32> + case 0 { + scf.yield %output_fifo_buff_0 : memref<10xi32> + } + case 1 { + scf.yield %output_fifo_buff_1 : memref<10xi32> + } + default { + scf.yield %output_fifo_buff_0 : memref<10xi32> + } + %3 = arith.cmpi eq, %arg0, %c0_1 : index + %4 = arith.subi %c10, %c1_2 : index + %5 = arith.cmpi eq, %arg0, %4 : index + scf.if %3 { + aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) + %8 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %9 = arith.index_cast %8 : i32 to index + %10 = scf.index_switch %9 -> memref<10xi32> + case 0 { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + case 1 { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + case 2 { + scf.yield %input_fifo_cons_buff_2 : memref<10xi32> + } + default { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + func.call @add_10_i32(%10, %10, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () + } else { + scf.if %5 { + aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 2) + %8 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %9 = arith.index_cast %8 : i32 to index + %10 = scf.index_switch %9 -> memref<10xi32> + case 0 { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + case 1 { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + case 2 { + scf.yield %input_fifo_cons_buff_2 : memref<10xi32> + } + default { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + %11 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %12 = arith.index_cast %11 : i32 to index + %13 = scf.index_switch %12 -> memref<10xi32> + case 0 { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + case 1 { + scf.yield %input_fifo_cons_buff_2 : memref<10xi32> + } + case 2 { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + default { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () + aie.use_lock(%input_fifo_cons_prod_lock, Release, 2) + %14 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %c2_4 = arith.constant 2 : i32 + %15 = arith.addi %14, %c2_4 : i32 + %16 = arith.remsi %15, %c3_i32 : i32 + memref.store %16, %buffer_0_2[%c1] : memref<2xi32> + } else { + %8 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %9 = arith.index_cast %8 : i32 to index + %10 = scf.index_switch %9 -> memref<10xi32> + case 0 { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + case 1 { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + case 2 { + scf.yield %input_fifo_cons_buff_2 : memref<10xi32> + } + default { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + %11 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %12 = arith.index_cast %11 : i32 to index + %13 = scf.index_switch %12 -> memref<10xi32> + case 0 { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + case 1 { + scf.yield %input_fifo_cons_buff_2 : memref<10xi32> + } + case 2 { + scf.yield %input_fifo_cons_buff_0 : memref<10xi32> + } + default { + scf.yield %input_fifo_cons_buff_1 : memref<10xi32> + } + func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () + aie.use_lock(%input_fifo_cons_prod_lock, Release, 1) + %14 = memref.load %buffer_0_2[%c1] : memref<2xi32> + %c1_4 = arith.constant 1 : i32 + %15 = arith.addi %14, %c1_4 : i32 + %16 = arith.remsi %15, %c3_i32 : i32 + memref.store %16, %buffer_0_2[%c1] : memref<2xi32> + } + } + aie.use_lock(%output_fifo_cons_lock, Release, 1) + %6 = memref.load %buffer_0_2[%c0_0] : memref<2xi32> + %c1_3 = arith.constant 1 : i32 + %7 = arith.addi %6, %c1_3 : i32 + %8 = arith.remsi %7, %c2_i32 : i32 + memref.store %8, %buffer_0_2[%c0_0] : memref<2xi32> + } + aie.end + } {link_with = "kernel.o"} + aie.shim_dma_allocation @input_fifo(MM2S, 0, 0) + aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 0 : i64, metadata = @input_fifo} : memref<10xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 2 : i64, metadata = @output_fifo} : memref<10xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + } + aie.shim_dma_allocation @output_fifo(S2MM, 0, 0) + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb4) + ^bb1: // 2 preds: ^bb0, ^bb3 + aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%input_fifo_cons_buff_0 : memref<10xi32>, 0, 10) + aie.use_lock(%input_fifo_cons_cons_lock, Release, 1) + aie.next_bd ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%input_fifo_cons_buff_1 : memref<10xi32>, 0, 10) + aie.use_lock(%input_fifo_cons_cons_lock, Release, 1) + aie.next_bd ^bb3 + ^bb3: // pred: ^bb2 + aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%input_fifo_cons_buff_2 : memref<10xi32>, 0, 10) + aie.use_lock(%input_fifo_cons_cons_lock, Release, 1) + aie.next_bd ^bb1 + ^bb4: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 0, ^bb5, ^bb7) + ^bb5: // 2 preds: ^bb4, ^bb6 + aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%output_fifo_buff_0 : memref<10xi32>, 0, 10) + aie.use_lock(%output_fifo_prod_lock, Release, 1) + aie.next_bd ^bb6 + ^bb6: // pred: ^bb5 + aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%output_fifo_buff_1 : memref<10xi32>, 0, 10) + aie.use_lock(%output_fifo_prod_lock, Release, 1) + aie.next_bd ^bb5 + ^bb7: // pred: ^bb4 + aie.end + } + } +} diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py deleted file mode 100644 index d7eae0bc31..0000000000 --- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 AMD Inc. - -# REQUIRES: ryzen_ai, valid_xchess_license -# -# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o -# RUN: %python %S/aie2.py > ./aie2.mlir -# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir -# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags -# RUN: %run_on_npu ./test.exe | FileCheck %s -# XFAIL: * -import numpy as np - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.helpers.dialects.ext.scf import _for as range_ -from aie.extras.context import mlir_mod_ctx - -N = 100 -n_rows = 10 -dev = AIEDevice.npu1_1col -col = 0 - - -def sliding_window(): - with mlir_mod_ctx() as ctx: - - @device(dev) - def device_body(): - subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]] - - # Tile declarations - ShimTile = tile(col, 0) - ComputeTile = tile(col, 2) - - # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, ComputeTile, 3, subtensor_ty) - of_out = object_fifo("out", ComputeTile, ShimTile, 2, subtensor_ty) - - # AIE Core Function declarations - add_10_i32 = external_func( - "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty] - ) - - # Set up compute tiles - @core(ComputeTile, "kernel.o") - def core_body(): - for i in range_(10): - elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) - if i == 0: - elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1) - add_10_i32(elemInPre, elemInPre, elemOut) - elif i == 9: - elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2) - add_10_i32(elemsInPost[0], elemsInPost[1], elemOut) - of_in.release(ObjectFifoPort.Consume, 2) - else: - elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2) - add_10_i32(elemsIn[0], elemsIn[1], elemOut) - of_in.release(ObjectFifoPort.Consume, 1) - - of_out.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - tensor_ty = np.ndarray[(N,), np.dtype[np.int32]] - - @runtime_sequence(tensor_ty, tensor_ty) - def sequence(A, C): - npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N]) - npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N]) - dma_wait(of_out) - - print(ctx.module) - - -sliding_window() diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit new file mode 100644 index 0000000000..6220c2ec10 --- /dev/null +++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, valid_xchess_license +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt %S/aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +// RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp index 648924ac4f..0fb9cfa7d4 100644 --- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp +++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp @@ -28,6 +28,7 @@ #define INPUT_SIZE (100 * sizeof(int)) // in bytes #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes #define WIDTH_SIZE (10 * sizeof(int)) // in bytes + #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py index 4fba84bb83..a48d6149ba 100644 --- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py +++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py @@ -9,10 +9,11 @@ # # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o # RUN: %python %S/aie2.py > ./aie2.mlir -# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags # RUN: %run_on_npu ./test.exe | FileCheck %s -# XFAIL: * +# CHECK: PASS! + import numpy as np from aie.dialects.aie import * diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp index 648924ac4f..0fb9cfa7d4 100644 --- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp +++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp @@ -28,6 +28,7 @@ #define INPUT_SIZE (100 * sizeof(int)) // in bytes #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes #define WIDTH_SIZE (10 * sizeof(int)) // in bytes + #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir index e91c1f9f21..16c028b6c3 100644 --- a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir +++ b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir @@ -11,20 +11,21 @@ // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s // CHECK: %core_0_2 = aie.core(%tile_0_2) { +// CHECK: %c0_i32 = arith.constant 0 : i32 // CHECK: %c0 = arith.constant 0 : index -// CHECK: %c0_0 = arith.constant 0 : index -// CHECK: %c2 = arith.constant 2 : index -// CHECK: memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c2_i32 = arith.constant 2 : i32 +// CHECK: memref.store %c0_i32, %buffer_0_2[%c0] : memref<2xi32> // CHECK: %c1 = arith.constant 1 : index -// CHECK: %c2_1 = arith.constant 2 : index -// CHECK: memref.store %c0, %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %c0_2 = arith.constant 0 : index -// CHECK: %c1_3 = arith.constant 1 : index +// CHECK: %c2_i32_0 = arith.constant 2 : i32 +// CHECK: memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %c0_1 = arith.constant 0 : index +// CHECK: %c1_2 = arith.constant 1 : index // CHECK: %c10 = arith.constant 10 : index -// CHECK: scf.for %arg0 = %c0_2 to %c10 step %c1_3 { +// CHECK: scf.for %arg0 = %c0_1 to %c10 step %c1_2 { // CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) -// CHECK: %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %1 = scf.index_switch %0 -> memref<10xi32> +// CHECK: %0 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %1 = arith.index_cast %0 : i32 to index +// CHECK: %2 = scf.index_switch %1 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } @@ -35,8 +36,9 @@ // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } // CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) -// CHECK: %2 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %3 = scf.index_switch %2 -> memref<10xi32> +// CHECK: %3 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %4 = arith.index_cast %3 : i32 to index +// CHECK: %5 = scf.index_switch %4 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } @@ -46,19 +48,19 @@ // CHECK: default { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } -// CHECK: func.call @passthrough_10_i32(%3, %1) : (memref<10xi32>, memref<10xi32>) -> () +// CHECK: func.call @passthrough_10_i32(%5, %2) : (memref<10xi32>, memref<10xi32>) -> () // CHECK: aie.use_lock(%input_fifo_cons_prod_lock, Release, 1) -// CHECK: %4 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %c1_4 = arith.constant 1 : index -// CHECK: %5 = arith.addi %4, %c1_4 : index -// CHECK: %6 = arith.remsi %5, %c2_1 : index -// CHECK: memref.store %6, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %6 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %c1_i32 = arith.constant 1 : i32 +// CHECK: %7 = arith.addi %6, %c1_i32 : i32 +// CHECK: %8 = arith.remsi %7, %c2_i32_0 : i32 +// CHECK: memref.store %8, %buffer_0_2[%c1] : memref<2xi32> // CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) -// CHECK: %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %c1_5 = arith.constant 1 : index -// CHECK: %8 = arith.addi %7, %c1_5 : index -// CHECK: %9 = arith.remsi %8, %c2 : index -// CHECK: memref.store %9, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %9 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %c1_i32_3 = arith.constant 1 : i32 +// CHECK: %10 = arith.addi %9, %c1_i32_3 : i32 +// CHECK: %11 = arith.remsi %10, %c2_i32 : i32 +// CHECK: memref.store %11, %buffer_0_2[%c0] : memref<2xi32> // CHECK: } // CHECK: aie.end // CHECK: } {dynamic_objfifo_lowering = true} @@ -78,7 +80,7 @@ // CHECK: func.call @passthrough_10_i32(%input_fifo2_cons_buff_1, %output_fifo2_buff_1) : (memref<10xi32>, memref<10xi32>) -> () // CHECK: aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1) // CHECK: aie.use_lock(%output_fifo2_cons_lock, Release, 1) -// CHECK: } +// CHECK: } // CHECK: aie.end // CHECK: } // CHECK: aie.shim_dma_allocation @input_fifo(MM2S, 0, 0) diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir index c169c8472e..087b8e5a2a 100644 --- a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir +++ b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir @@ -35,21 +35,22 @@ // CHECK: %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"} // CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) // CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) -// CHECK: %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> +// CHECK: %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> // CHECK: %core_0_2 = aie.core(%tile_0_2) { +// CHECK: %c0_i32 = arith.constant 0 : i32 // CHECK: %c0 = arith.constant 0 : index -// CHECK: %c0_0 = arith.constant 0 : index -// CHECK: %c2 = arith.constant 2 : index -// CHECK: memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %c2_i32 = arith.constant 2 : i32 +// CHECK: memref.store %c0_i32, %buffer_0_2[%c0] : memref<2xi32> // CHECK: %c1 = arith.constant 1 : index -// CHECK: %c3 = arith.constant 3 : index -// CHECK: memref.store %c0, %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %c0_1 = arith.constant 0 : index -// CHECK: %c1_2 = arith.constant 1 : index +// CHECK: %c3_i32 = arith.constant 3 : i32 +// CHECK: memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %c0_0 = arith.constant 0 : index +// CHECK: %c1_1 = arith.constant 1 : index // CHECK: %c9 = arith.constant 9 : index // CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) -// CHECK: %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %1 = scf.index_switch %0 -> memref<10xi32> +// CHECK: %0 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %1 = arith.index_cast %0 : i32 to index +// CHECK: %2 = scf.index_switch %1 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } @@ -60,8 +61,9 @@ // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } // CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) -// CHECK: %2 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %3 = scf.index_switch %2 -> memref<10xi32> +// CHECK: %3 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %4 = arith.index_cast %3 : i32 to index +// CHECK: %5 = scf.index_switch %4 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } @@ -74,17 +76,18 @@ // CHECK: default { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } -// CHECK: func.call @add_10_i32(%3, %3, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () +// CHECK: func.call @add_10_i32(%5, %5, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () // CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) -// CHECK: %4 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %c1_3 = arith.constant 1 : index -// CHECK: %5 = arith.addi %4, %c1_3 : index -// CHECK: %6 = arith.remsi %5, %c2 : index -// CHECK: memref.store %6, %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: scf.for %arg0 = %c0_1 to %c9 step %c1_2 { +// CHECK: %6 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %c1_i32 = arith.constant 1 : i32 +// CHECK: %7 = arith.addi %6, %c1_i32 : i32 +// CHECK: %8 = arith.remsi %7, %c2_i32 : i32 +// CHECK: memref.store %8, %buffer_0_2[%c0] : memref<2xi32> +// CHECK: scf.for %arg0 = %c0_0 to %c9 step %c1_1 { // CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) -// CHECK: %19 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %20 = scf.index_switch %19 -> memref<10xi32> +// CHECK: %24 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %25 = arith.index_cast %24 : i32 to index +// CHECK: %26 = scf.index_switch %25 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } @@ -95,8 +98,9 @@ // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } // CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) -// CHECK: %21 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %22 = scf.index_switch %21 -> memref<10xi32> +// CHECK: %27 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %28 = arith.index_cast %27 : i32 to index +// CHECK: %29 = scf.index_switch %28 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } @@ -109,8 +113,9 @@ // CHECK: default { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } -// CHECK: %23 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %24 = scf.index_switch %23 -> memref<10xi32> +// CHECK: %30 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %31 = arith.index_cast %30 : i32 to index +// CHECK: %32 = scf.index_switch %31 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> // CHECK: } @@ -123,23 +128,24 @@ // CHECK: default { // CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> // CHECK: } -// CHECK: func.call @add_10_i32(%22, %24, %20) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () +// CHECK: func.call @add_10_i32(%29, %32, %26) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () // CHECK: aie.use_lock(%input_fifo_cons_prod_lock, Release, 1) -// CHECK: %25 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %c1_6 = arith.constant 1 : index -// CHECK: %26 = arith.addi %25, %c1_6 : index -// CHECK: %27 = arith.remsi %26, %c3 : index -// CHECK: memref.store %27, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %33 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %c1_i32_4 = arith.constant 1 : i32 +// CHECK: %34 = arith.addi %33, %c1_i32_4 : i32 +// CHECK: %35 = arith.remsi %34, %c3_i32 : i32 +// CHECK: memref.store %35, %buffer_0_2[%c1] : memref<2xi32> // CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) -// CHECK: %28 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %c1_7 = arith.constant 1 : index -// CHECK: %29 = arith.addi %28, %c1_7 : index -// CHECK: %30 = arith.remsi %29, %c2 : index -// CHECK: memref.store %30, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %36 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %c1_i32_5 = arith.constant 1 : i32 +// CHECK: %37 = arith.addi %36, %c1_i32_5 : i32 +// CHECK: %38 = arith.remsi %37, %c2_i32 : i32 +// CHECK: memref.store %38, %buffer_0_2[%c0] : memref<2xi32> // CHECK: } // CHECK: aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1) -// CHECK: %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %8 = scf.index_switch %7 -> memref<10xi32> +// CHECK: %9 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %10 = arith.index_cast %9 : i32 to index +// CHECK: %11 = scf.index_switch %10 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } @@ -150,8 +156,9 @@ // CHECK: scf.yield %output_fifo_buff_0 : memref<10xi32> // CHECK: } // CHECK: aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1) -// CHECK: %9 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %10 = scf.index_switch %9 -> memref<10xi32> +// CHECK: %12 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %13 = arith.index_cast %12 : i32 to index +// CHECK: %14 = scf.index_switch %13 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } @@ -164,8 +171,9 @@ // CHECK: default { // CHECK: scf.yield %input_fifo_cons_buff_0 : memref<10xi32> // CHECK: } -// CHECK: %11 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %12 = scf.index_switch %11 -> memref<10xi32> +// CHECK: %15 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %16 = arith.index_cast %15 : i32 to index +// CHECK: %17 = scf.index_switch %16 -> memref<10xi32> // CHECK: case 0 { // CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> // CHECK: } @@ -178,19 +186,19 @@ // CHECK: default { // CHECK: scf.yield %input_fifo_cons_buff_1 : memref<10xi32> // CHECK: } -// CHECK: func.call @add_10_i32(%10, %12, %8) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () +// CHECK: func.call @add_10_i32(%14, %17, %11) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> () // CHECK: aie.use_lock(%input_fifo_cons_prod_lock, Release, 2) -// CHECK: %13 = memref.load %buffer_0_2[%c1] : memref<2xindex> -// CHECK: %c2_4 = arith.constant 2 : index -// CHECK: %14 = arith.addi %13, %c2_4 : index -// CHECK: %15 = arith.remsi %14, %c3 : index -// CHECK: memref.store %15, %buffer_0_2[%c1] : memref<2xindex> +// CHECK: %18 = memref.load %buffer_0_2[%c1] : memref<2xi32> +// CHECK: %c2_i32_2 = arith.constant 2 : i32 +// CHECK: %19 = arith.addi %18, %c2_i32_2 : i32 +// CHECK: %20 = arith.remsi %19, %c3_i32 : i32 +// CHECK: memref.store %20, %buffer_0_2[%c1] : memref<2xi32> // CHECK: aie.use_lock(%output_fifo_cons_lock, Release, 1) -// CHECK: %16 = memref.load %buffer_0_2[%c0_0] : memref<2xindex> -// CHECK: %c1_5 = arith.constant 1 : index -// CHECK: %17 = arith.addi %16, %c1_5 : index -// CHECK: %18 = arith.remsi %17, %c2 : index -// CHECK: memref.store %18, %buffer_0_2[%c0_0] : memref<2xindex> +// CHECK: %21 = memref.load %buffer_0_2[%c0] : memref<2xi32> +// CHECK: %c1_i32_3 = arith.constant 1 : i32 +// CHECK: %22 = arith.addi %21, %c1_i32_3 : i32 +// CHECK: %23 = arith.remsi %22, %c2_i32 : i32 +// CHECK: memref.store %23, %buffer_0_2[%c0] : memref<2xi32> // CHECK: aie.end // CHECK: } // CHECK: aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)