Skip to content

Commit

Permalink
Fixing dynamic objectFifo (#1907)
Browse files Browse the repository at this point in the history
Co-authored-by: AndraBisca <[email protected]>
  • Loading branch information
pvasireddy-amd and abisca authored Dec 6, 2024
1 parent 12d3878 commit ea74c75
Show file tree
Hide file tree
Showing 14 changed files with 320 additions and 170 deletions.
13 changes: 8 additions & 5 deletions lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,7 @@ struct AIEObjectFifoStatefulTransformPass
builder.getUnknownLoc(), globalNextIndex,
ValueRange(ArrayRef({index.getResult()})));
Value val = builder.create<arith::ConstantOp>(
oldCounter.getLoc(), builder.getIndexAttr(relOp.getSize()));
oldCounter.getLoc(), builder.getI32IntegerAttr(relOp.getSize()));
Value sum = builder.create<arith::AddIOp>(val.getLoc(), oldCounter, val);
Value newCounter = builder.create<arith::RemSIOp>(sum.getLoc(), sum, size);
builder.create<memref::StoreOp>(size.getLoc(), newCounter, globalNextIndex,
Expand Down Expand Up @@ -1091,7 +1091,7 @@ struct AIEObjectFifoStatefulTransformPass
builder.setInsertionPoint(coreOp);
auto memrefTy =
MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
builder.getIndexType());
builder.getI32Type());
auto globalNextIndex = builder.create<BufferOp>(
builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
/*sym_name*/ nullptr, /*address*/ nullptr,
Expand All @@ -1109,14 +1109,14 @@ struct AIEObjectFifoStatefulTransformPass
int index = 0;
builder.setInsertionPointToStart(&(coreOp.getBody().front()));
Value initVal = builder.create<arith::ConstantOp>(
builder.getUnknownLoc(), builder.getIndexAttr(0));
builder.getUnknownLoc(), builder.getI32IntegerAttr(0));
for (auto i : fifoSizes) {
auto indexOp = builder.create<arith::ConstantOp>(
initVal.getLoc(), builder.getIndexAttr(index));
globalIndices[i.first] = indexOp;
index++;
auto size = builder.create<arith::ConstantOp>(
indexOp.getLoc(), builder.getIndexAttr(i.second));
indexOp.getLoc(), builder.getI32IntegerAttr(i.second));
constantSizes[i.first] = size;
builder.create<memref::StoreOp>(
size.getLoc(), initVal, globalNextIndex,
Expand Down Expand Up @@ -1153,10 +1153,13 @@ struct AIEObjectFifoStatefulTransformPass

// Create a switch for each subview access
builder.setInsertionPointAfter(accessOp);
auto switchIndex = builder.create<memref::LoadOp>(
auto switchIndexAsInteger = builder.create<memref::LoadOp>(
builder.getUnknownLoc(), globalNextIndex,
ValueRange(
ArrayRef({globalIndices[{createOp, port}].getResult()})));
auto switchIndex = builder.create<arith::IndexCastOp>(
builder.getUnknownLoc(), builder.getIndexType(),
switchIndexAsInteger);
unsigned caseRegionCounts = fifoSizes[{createOp, port}];
SmallVector<int64_t, 4> caseValues;
for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
Expand Down
5 changes: 3 additions & 2 deletions test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
#
# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
# CHECK: PASS!

import numpy as np

from aie.dialects.aie import *
Expand Down
2 changes: 1 addition & 1 deletion test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#
# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# CHECK: PASS!
Expand Down
2 changes: 1 addition & 1 deletion test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#
# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# CHECK: PASS!
Expand Down
5 changes: 3 additions & 2 deletions test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
#
# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# XFAIL: *
# CHECK: PASS!

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
Expand Down
1 change: 1 addition & 0 deletions test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
// Buffer sizes used by the host-side test, all expressed in bytes.
#define INPUT_SIZE (100 * sizeof(int)) // in bytes
#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
#define WIDTH_SIZE (10 * sizeof(int)) // in bytes

// Number of WIDTH_SIZE-sized rows in the input/output buffers.
// The replacement lists are fully parenthesized so the macros behave as a
// single value inside larger expressions (e.g. `x % INPUT_ROWS`); without the
// parentheses the division would bind to the surrounding operators instead.
#define INPUT_ROWS ((INPUT_SIZE) / (WIDTH_SIZE))
#define OUTPUT_ROWS ((OUTPUT_SIZE) / (WIDTH_SIZE))

Expand Down
201 changes: 201 additions & 0 deletions test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
// Test input for an npu1_1col device using two tiles, (0, 0) and (0, 2).
// The core on tile (0, 2) reads from three "input_fifo_cons" buffers and
// writes to two "output_fifo" buffers, choosing the buffer at run time from
// counters kept in %buffer_0_2 (scf.index_switch on the loaded counter).
module {
aie.device(npu1_1col) {
memref.global "public" @output_fifo_cons : memref<10xi32>
memref.global "public" @output_fifo : memref<10xi32>
memref.global "public" @input_fifo_cons : memref<10xi32>
memref.global "public" @input_fifo : memref<10xi32>
// Kernel declaration only; the core below is linked with "kernel.o".
func.func private @add_10_i32(memref<10xi32>, memref<10xi32>, memref<10xi32>)
%tile_0_0 = aie.tile(0, 0)
%tile_0_2 = aie.tile(0, 2)
%output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 0 : i32, sym_name = "output_fifo_cons_prod_lock"}
%output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"}
// Output side on tile (0, 2): two buffers, producer lock starts at 2.
%output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32>
%output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32>
%output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"}
%output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"}
// Input side on tile (0, 2): three buffers, producer lock starts at 3.
%input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32>
%input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32>
%input_fifo_cons_buff_2 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_2"} : memref<10xi32>
%input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 3 : i32, sym_name = "input_fifo_cons_prod_lock"}
%input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"}
%input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 0 : i32, sym_name = "input_fifo_prod_lock"}
%input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
// One DMA flow in each direction between tile (0, 0) and tile (0, 2).
aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
// Two i32 counters: element 0 selects the output buffer (kept modulo 2),
// element 1 selects the input buffer (kept modulo 3).
%buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32>
%core_0_2 = aie.core(%tile_0_2) {
// Zero both counters before entering the loop.
%c0_i32 = arith.constant 0 : i32
%c0_0 = arith.constant 0 : index
%c2_i32 = arith.constant 2 : i32
memref.store %c0_i32, %buffer_0_2[%c0_0] : memref<2xi32>
%c1 = arith.constant 1 : index
%c3_i32 = arith.constant 3 : i32
memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
%c0_1 = arith.constant 0 : index
%c10 = arith.constant 10 : index
%c1_2 = arith.constant 1 : index
// Ten iterations; each one fills a single output buffer.
scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
// Pick the output buffer from counter 0 (i32 counter cast to index).
%0 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
%1 = arith.index_cast %0 : i32 to index
%2 = scf.index_switch %1 -> memref<10xi32>
case 0 {
scf.yield %output_fifo_buff_0 : memref<10xi32>
}
case 1 {
scf.yield %output_fifo_buff_1 : memref<10xi32>
}
default {
scf.yield %output_fifo_buff_0 : memref<10xi32>
}
// %3 is true on the first iteration, %5 on the last one (%arg0 == 10 - 1).
%3 = arith.cmpi eq, %arg0, %c0_1 : index
%4 = arith.subi %c10, %c1_2 : index
%5 = arith.cmpi eq, %arg0, %4 : index
scf.if %3 {
// First iteration: acquire one input buffer and pass it as both kernel
// inputs. Nothing is released and counter 1 is not advanced here.
aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
%8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%9 = arith.index_cast %8 : i32 to index
%10 = scf.index_switch %9 -> memref<10xi32>
case 0 {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
case 1 {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
case 2 {
scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
}
default {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
func.call @add_10_i32(%10, %10, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
} else {
scf.if %5 {
// Last iteration: acquire with value 2, use the buffer at counter 1 and
// the next one in the ring, then release 2 and advance counter 1 by 2.
aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 2)
%8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%9 = arith.index_cast %8 : i32 to index
%10 = scf.index_switch %9 -> memref<10xi32>
case 0 {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
case 1 {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
case 2 {
scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
}
default {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
// Same counter value, but the cases are rotated by one so that %13 is the
// buffer following %10 (0 -> buff_1, 1 -> buff_2, 2 -> buff_0).
%11 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%12 = arith.index_cast %11 : i32 to index
%13 = scf.index_switch %12 -> memref<10xi32>
case 0 {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
case 1 {
scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
}
case 2 {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
default {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
// counter 1 = (counter 1 + 2) rem 3
%14 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%c2_4 = arith.constant 2 : i32
%15 = arith.addi %14, %c2_4 : i32
%16 = arith.remsi %15, %c3_i32 : i32
memref.store %16, %buffer_0_2[%c1] : memref<2xi32>
} else {
// Remaining iterations: use the buffer at counter 1 and the next one in
// the ring, then release 1 and advance counter 1 by 1.
// NOTE(review): unlike the two branches above, this branch contains no
// use_lock acquire on %input_fifo_cons_cons_lock before the buffers are
// read - confirm this is intended.
%8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%9 = arith.index_cast %8 : i32 to index
%10 = scf.index_switch %9 -> memref<10xi32>
case 0 {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
case 1 {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
case 2 {
scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
}
default {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
// Rotated case table: selects the buffer following %10 in the ring.
%11 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%12 = arith.index_cast %11 : i32 to index
%13 = scf.index_switch %12 -> memref<10xi32>
case 0 {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
case 1 {
scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
}
case 2 {
scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
}
default {
scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
}
func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
// counter 1 = (counter 1 + 1) rem 3
%14 = memref.load %buffer_0_2[%c1] : memref<2xi32>
%c1_4 = arith.constant 1 : i32
%15 = arith.addi %14, %c1_4 : i32
%16 = arith.remsi %15, %c3_i32 : i32
memref.store %16, %buffer_0_2[%c1] : memref<2xi32>
}
}
// Hand the filled output buffer over and advance counter 0:
// counter 0 = (counter 0 + 1) rem 2
aie.use_lock(%output_fifo_cons_lock, Release, 1)
%6 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
%c1_3 = arith.constant 1 : i32
%7 = arith.addi %6, %c1_3 : i32
%8 = arith.remsi %7, %c2_i32 : i32
memref.store %8, %buffer_0_2[%c0_0] : memref<2xi32>
}
aie.end
} {link_with = "kernel.o"}
aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)
// Host-side sequence: one transfer for @input_fifo, one for @output_fifo,
// followed by a sync.
// NOTE(review): both transfers use a size of 100 in the innermost dimension
// while the arguments are typed memref<10xi32> - confirm the argument types.
aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 0 : i64, metadata = @input_fifo} : memref<10xi32>
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 2 : i64, metadata = @output_fifo} : memref<10xi32>
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
}
aie.shim_dma_allocation @output_fifo(S2MM, 0, 0)
// DMA program for tile (0, 2).
%mem_0_2 = aie.mem(%tile_0_2) {
// S2MM channel 0: blocks ^bb1 -> ^bb2 -> ^bb3 -> ^bb1 cycle through the three
// input buffers; each block acquires the input producer lock and releases the
// input consumer lock.
%0 = aie.dma_start(S2MM, 0, ^bb1, ^bb4)
^bb1: // 2 preds: ^bb0, ^bb3
aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%input_fifo_cons_buff_0 : memref<10xi32>, 0, 10)
aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
aie.next_bd ^bb2
^bb2: // pred: ^bb1
aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%input_fifo_cons_buff_1 : memref<10xi32>, 0, 10)
aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
aie.next_bd ^bb3
^bb3: // pred: ^bb2
aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%input_fifo_cons_buff_2 : memref<10xi32>, 0, 10)
aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
aie.next_bd ^bb1
^bb4: // pred: ^bb0
// MM2S channel 0: blocks ^bb5 -> ^bb6 -> ^bb5 alternate between the two
// output buffers; each block acquires the output consumer lock and releases
// the output producer lock.
%1 = aie.dma_start(MM2S, 0, ^bb5, ^bb7)
^bb5: // 2 preds: ^bb4, ^bb6
aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%output_fifo_buff_0 : memref<10xi32>, 0, 10)
aie.use_lock(%output_fifo_prod_lock, Release, 1)
aie.next_bd ^bb6
^bb6: // pred: ^bb5
aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1)
aie.dma_bd(%output_fifo_buff_1 : memref<10xi32>, 0, 10)
aie.use_lock(%output_fifo_prod_lock, Release, 1)
aie.next_bd ^bb5
^bb7: // pred: ^bb4
aie.end
}
}
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, valid_xchess_license
//
// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt %S/aie.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
// RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
// CHECK: PASS!
Loading

0 comments on commit ea74c75

Please sign in to comment.