diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc
index 5745f364dc..c5ed109332 100755
--- a/aie_kernels/aie2/mul.cc
+++ b/aie_kernels/aie2/mul.cc
@@ -8,10 +8,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define __AIENGINE__ 2
-#define NOCPP
-#define __AIEARCH__ 20
-
 #include
 #include
 #include
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
old mode 100644
new mode 100755
index 2833afc94f..480eab9a75
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -12,6 +12,7 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

 include ${srcdir}/../../makefile-common

+device = npu
 targetname = passThroughKernel
 VPATH := ${srcdir}/../../../aie_kernels/generic
 data_size = 4096
@@ -31,27 +32,44 @@ all: build/final_${data_size}.xclbin

 build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${data_size} 0 > $@
+	python3 $< ${device} ${data_size} 0 > $@

 build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${data_size} ${trace_size} > $@
+	python3 $< ${device} ${data_size} ${trace_size} > $@

 build/passThrough.cc.o: passThrough.cc
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else ifeq ($(device),npu2)
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else
+	echo "Device type not supported"
+endif

 build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
 		--no-xchesscc --no-xbridge \
 		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+else
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+endif

 build/final_trace_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
 		--no-xchesscc --no-xbridge \
 		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+else
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+endif
+
 ${targetname}_${data_size}.exe: ${srcdir}/test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
old mode 100644
new mode 100755
index 2bfdbb3066..ff03ab0bd8
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -16,11 +16,11 @@ import aie.utils.trace as trace_utils


-def passthroughKernel(vector_size, trace_size):
+def passthroughKernel(dev, vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors

-    @device(AIEDevice.npu1_1col)
+    @device(dev)
     def device_body():
         # define types
         vector_ty = np.ndarray[(N,), np.dtype[np.uint8]]
@@ -55,8 +55,6 @@ def core_body():
                 of_in.release(ObjectFifoPort.Consume, 1)
                 of_out.release(ObjectFifoPort.Produce, 1)

-        # print(ctx.module.operation.verify())
-
         @runtime_sequence(vector_ty, vector_ty, vector_ty)
         def sequence(inTensor, outTensor, notUsed):
             if trace_size > 0:
@@ -85,13 +83,20 @@ def sequence(inTensor, outTensor, notUsed):

 try:
-    vector_size = int(sys.argv[1])
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    vector_size = int(sys.argv[2])
     if vector_size % 64 != 0 or vector_size < 512:
         print("Vector size must be a multiple of 64 and greater than or equal to 512")
         raise ValueError
-    trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
+    trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
 except ValueError:
     print("Argument has inappropriate value")

 with mlir_mod_ctx() as ctx:
-    passthroughKernel(vector_size, trace_size)
+    passthroughKernel(dev, vector_size, trace_size)
     print(ctx.module)
diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py
index f41965455c..b2835ff997 100644
--- a/programming_examples/basic/passthrough_kernel/aie2_alt.py
+++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py
@@ -16,11 +16,11 @@ import aie.utils.trace as trace_utils


-def passthroughKernel(vector_size, trace_size):
+def passthroughKernel(dev, vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors

-    @device(AIEDevice.npu1_1col)
+    @device(dev)
     def device_body():
         # define types
         vector_ty = np.ndarray[(N,), np.dtype[np.uint8]]
@@ -79,13 +79,20 @@ def sequence(inTensor, outTensor, notUsed):

 try:
-    vector_size = int(sys.argv[1])
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    vector_size = int(sys.argv[2])
     if vector_size % 64 != 0 or vector_size < 512:
         print("Vector size must be a multiple of 64 and greater than or equal to 512")
         raise ValueError
-    trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
+    trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
 except ValueError:
     print("Argument has inappropriate value")

 with mlir_mod_ctx() as ctx:
-    passthroughKernel(vector_size, trace_size)
+    passthroughKernel(dev, vector_size, trace_size)
     print(ctx.module)
diff --git a/programming_examples/basic/passthrough_kernel/run_makefile.lit b/programming_examples/basic/passthrough_kernel/run_makefile.lit
index e8213c5d18..4c5bc14c4f 100644
--- a/programming_examples/basic/passthrough_kernel/run_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_makefile.lit
@@ -3,9 +3,9 @@
 //
 // REQUIRES: ryzen_ai, peano
 //
+// RUN: mkdir -p test
+// RUN: cd test
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
-// CHECK: Running...
-// CHECK: PASS!
+// RUN: %run_on_npu make -f %S/Makefile run
+// RUN: %run_on_npu make -f %S/Makefile run_py
diff --git a/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit b/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit
index 4a4a70e117..c37843fa25 100644
--- a/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit
+++ b/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit
@@ -7,6 +7,5 @@
 // RUN: cd test_alt
 // RUN: make -f %S/Makefile clean
 // RUN: env use_alt=1 make -f %S/Makefile
-// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
-// CHECK: PASS!
+// RUN: %run_on_npu make -f %S/Makefile run_py
\ No newline at end of file
diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
new file mode 100755
index 0000000000..0901bb542f
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: mkdir -p test_stx
+// RUN: cd test_stx
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile device=npu2
+// RUN: %run_on_2npu make -f %S/Makefile run device=npu2
diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index f93ddb3ac7..0987344b6a 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -40,11 +40,11 @@ def main(opts):

     if not errors:
         print("\nPASS!\n")
-        exit(0)
+        sys.exit(0)
     else:
         print("\nError count: ", errors)
         print("\nFailed.\n")
-        exit(-1)
+        sys.exit(1)


 if __name__ == "__main__":
diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile
index 84456f497c..9ed598062a 100644
--- a/programming_examples/basic/vector_scalar_mul/Makefile
+++ b/programming_examples/basic/vector_scalar_mul/Makefile
@@ -14,6 +14,7 @@ include ${srcdir}/../../makefile-common

 VPATH := ${srcdir}/../../../aie_kernels/aie2

+device = npu
 targetname = vectorScalar
 data_size = 4096
 trace_size = 8192
@@ -32,19 +33,25 @@ kristof: build/insts_${data_size}.txt

 build/%.o: %.cc
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 ifeq ($(CHESS), true)
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F};
 else
 	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -c $< -o ${@F};
 endif
+else ifeq ($(device),npu2)
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else
+	echo "Device type not supported"
+endif

 build/aie_${data_size}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${data_size} 0 > $@
+	python3 $< ${device} ${data_size} 0 > $@

 build/aie_trace_${data_size}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${data_size} ${trace_size} > $@
+	python3 $< ${device} ${data_size} ${trace_size} > $@

 #build/insts_${data_size}.txt: build/final_${data_size}.xclbin
 build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
index 1d367e5aab..0617dafdad 100644
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -16,7 +16,7 @@ import aie.utils.trace as trace_utils


-def my_vector_scalar(vector_size, 
trace_size): +def my_vector_scalar(dev, vector_size, trace_size): N = vector_size N_in_bytes = N * 2 N_div_n = 4 # chop input vector into 4 sub-vectors @@ -26,7 +26,7 @@ def my_vector_scalar(vector_size, trace_size): vectorized = True - @device(AIEDevice.npu1_1col) + @device(dev) def device_body(): tensor_ty = np.ndarray[(N,), np.dtype[np.int16]] tile_ty = np.ndarray[(n,), np.dtype[np.int16]] @@ -93,13 +93,20 @@ def sequence(A, F, C): try: - vector_size = int(sys.argv[1]) + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + vector_size = int(sys.argv[2]) if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError - trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) + trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - my_vector_scalar(vector_size, trace_size) - print(ctx.module) + my_vector_scalar(dev, vector_size, trace_size) +print(ctx.module) diff --git a/programming_examples/basic/vector_scalar_mul/aie2_alt.py b/programming_examples/basic/vector_scalar_mul/aie2_alt.py index 335e966745..6c52fc1c21 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2_alt.py +++ b/programming_examples/basic/vector_scalar_mul/aie2_alt.py @@ -16,7 +16,7 @@ import aie.utils.trace as trace_utils -def my_vector_scalar(vector_size, trace_size): +def my_vector_scalar(dev, vector_size, trace_size): N = vector_size N_in_bytes = N * 2 N_div_n = 4 # chop input vector into 4 sub-vectors @@ -26,7 +26,7 @@ def my_vector_scalar(vector_size, trace_size): vectorized = True - @device(AIEDevice.npu1_1col) + @device(dev) def device_body(): tensor_ty = np.ndarray[(N,), np.dtype[np.int16]] tile_ty = np.ndarray[(n,), np.dtype[np.int16]] @@ -97,13 +97,20 @@ def sequence(A, F, C): try: - vector_size = int(sys.argv[1]) + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + vector_size = int(sys.argv[2]) if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError - trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) + trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - my_vector_scalar(vector_size, trace_size) - print(ctx.module) + my_vector_scalar(dev, vector_size, trace_size) +print(ctx.module) diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile.lit b/programming_examples/basic/vector_scalar_mul/run_makefile.lit index d298884111..244bff012a 100644 --- a/programming_examples/basic/vector_scalar_mul/run_makefile.lit +++ b/programming_examples/basic/vector_scalar_mul/run_makefile.lit @@ -7,9 +7,8 @@ // RUN: cd test_peano // RUN: make -f %S/Makefile clean // RUN: env CHESS=false make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s +// RUN: %run_on_npu make -f %S/Makefile run +// RUN: %run_on_npu make -f %S/Makefile run_py // RUN: make -f %S/Makefile clean -// RUN: env CHESS=false %run_on_npu make -f 
%S/Makefile trace | FileCheck %s -// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py | FileCheck %s -// CHECK: PASS! +// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace +// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit b/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit index edfe402ec1..f0b5578ffa 100644 --- a/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit +++ b/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env CHESS=true use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! - \ No newline at end of file +// RUN: %run_on_npu make -f %S/Makefile run + diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit b/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit index 481b220165..da7102bcfc 100644 --- a/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit +++ b/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit @@ -7,9 +7,8 @@ // RUN: cd test_chess // RUN: make -f %S/Makefile clean // RUN: env CHESS=true make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s +// RUN: %run_on_npu make -f %S/Makefile run +// RUN: %run_on_npu make -f %S/Makefile run_py // RUN: make -f %S/Makefile clean -// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace | FileCheck %s -// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py | FileCheck %s -// CHECK: PASS! +// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace +// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py diff --git a/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit b/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit new file mode 100755 index 0000000000..0901bb542f --- /dev/null +++ b/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p test_stx +// RUN: cd test_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index 9ffa7dab1d..c91b53307f 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -71,11 +71,11 @@ def main(opts): if not errors: print("\nPASS!\n") - exit(0) + sys.exit(0) else: print("\nError count: ", errors) print("\nFailed.\n") - exit(-1) + sys.exit(1) if __name__ == "__main__": diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 97e1246045..28a782d51c 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -46,6 +46,7 @@ llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python")) run_on_npu = "echo" +run_on_2npu = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -140,9 +141,14 @@ if len(m.groups()) == 3: print("\tmodel:", m.group(3)) config.available_features.add("ryzen_ai") - run_on_npu = ( - f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" - ) + if str(m.group(3)) == "npu1": + run_on_npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" + ) + if str(m.group(3)) == "npu4": + run_on_2npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" + ) break except: print("Failed to run xrt-smi") @@ -151,6 +157,7 @@ print("xrt not found") config.substitutions.append(("%run_on_npu", run_on_npu)) +config.substitutions.append(("%run_on_2npu", run_on_2npu)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1" diff --git a/programming_examples/lit.site.cfg.py.in b/programming_examples/lit.site.cfg.py.in index 22a367d1fc..3ba7a457f1 100755 --- a/programming_examples/lit.site.cfg.py.in +++ b/programming_examples/lit.site.cfg.py.in @@ -69,6 +69,8 @@ if lit.util.pythonize_bool("@AIETools_AIE_FOUND@"): config.vitis_components.append("AIE") if lit.util.pythonize_bool("@AIETools_AIE2_FOUND@"): config.vitis_components.append("AIE2") +if lit.util.pythonize_bool("@AIETools_AIE2P_FOUND@"): + config.vitis_components.append("AIE2P") # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. 
diff --git a/programming_examples/makefile-common b/programming_examples/makefile-common index bdde6760b6..2dd70864d3 100644 --- a/programming_examples/makefile-common +++ b/programming_examples/makefile-common @@ -13,6 +13,7 @@ CHESS_FLAGS = -P ${AIE_INCLUDE_DIR} CHESSCCWRAP1_FLAGS = aie -I ${AIETOOLS_DIR}/include CHESSCCWRAP2_FLAGS = aie2 -I ${AIETOOLS_DIR}/include +CHESSCCWRAP2P_FLAGS = aie2p -I ${AIETOOLS_DIR}/include PEANOWRAP2_FLAGS = -O2 -v -std=c++20 --target=aie2-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no) diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile index a3dfaa8646..84b2d710f3 100644 --- a/programming_examples/vision/color_threshold/Makefile +++ b/programming_examples/vision/color_threshold/Makefile @@ -12,6 +12,7 @@ include ${srcdir}/../../makefile-common VPATH := ${srcdir}/../../../aie_kernels/aie2 +device = npu COLORTHRESHOLD_WIDTH = 1920 COLORTHRESHOLD_HEIGHT = 1080 @@ -33,17 +34,28 @@ mlir: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/%.cc.o: %.cc mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else ifeq ($(device),npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif build/aie2_${COLORTHRESHOLD_WIDTH}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} - python3 $< ${COLORTHRESHOLD_WIDTH} ${COLORTHRESHOLD_HEIGHT} > $@ + python3 $< ${device} ${COLORTHRESHOLD_WIDTH} ${COLORTHRESHOLD_HEIGHT} > $@ build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/threshold.cc.o mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ --no-xchesscc --no-xbridge \ --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +else + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +endif ${targetname}.exe: ${srcdir}/test.cpp rm -rf _build diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index cbb5c2e631..1bd250c281 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -14,258 +14,240 @@ from aie.helpers.util import np_ndarray_type_get_shape from aie.helpers.dialects.ext.scf import _for as range_ -width = 512 -height = 9 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -lineWidth = width -lineWidthChannels = width * 4 # 4 channels -tensorSize = width * height - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def color_threshold(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - - # AIE Core Function declarations - thresholdLine = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - 
ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) - - # AIE-array data movement with object fifos - - # Input RGBA broadcast + memtile for skip - inOOB_L3L2 = object_fifo( - "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty - ) - inOOB_L2L1_0 = object_fifo( - "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty - ) - inOOB_L2L1_1 = object_fifo( - "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty - ) - inOOB_L2L1_2 = object_fifo( - "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty - ) - inOOB_L2L1_3 = object_fifo( - "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty - ) - of_offsets = [ - np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) - ] - object_fifo_link( - inOOB_L3L2, - [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], - [], - of_offsets, - ) - - # Output RGBA - outOOB_L2L3 = object_fifo( - "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty - ) - outOOB_L1L2_0 = object_fifo( - "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty - ) - outOOB_L1L2_1 = object_fifo( - "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty - ) - outOOB_L1L2_2 = object_fifo( - "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty - ) - outOOB_L1L2_3 = object_fifo( - "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty - ) - object_fifo_link( - [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], - outOOB_L2L3, - of_offsets, - [], - ) - - # Runtime parameters - rtpComputeTile2 = buffer( - ComputeTile2, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile2", - use_write_rtp=True, - ) - rtpComputeTile3 = buffer( - ComputeTile3, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile3", - use_write_rtp=True, - ) - rtpComputeTile4 = buffer( - ComputeTile4, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile4", - use_write_rtp=True, - ) - rtpComputeTile5 = buffer( - ComputeTile5, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile5", - use_write_rtp=True, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = 
inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence( - np.ndarray[(tensorSize,), np.dtype[np.int8]], - np.ndarray[(32,), np.dtype[np.int32]], # not used - np.ndarray[(tensorSize,), np.dtype[np.int8]], - ) - def sequence(inTensor, notUsed, outTensor): - # thresholdValue, maxValue, thresholdType - rtpComputeTile2[0] = 50 - rtpComputeTile2[1] = 255 - rtpComputeTile2[2] = 0 - - rtpComputeTile3[0] = 50 - rtpComputeTile3[1] = 255 - rtpComputeTile3[2] = 0 - - rtpComputeTile4[0] = 50 - rtpComputeTile4[1] = 255 - rtpComputeTile4[2] = 0 +def color_threshold(dev, width, height): + lineWidth = width + lineWidthChannels = width * 4 # 4 channels + tensorSize = width * height + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + + # AIE Core Function declarations + thresholdLine = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + + # Input RGBA broadcast + memtile for skip + inOOB_L3L2 = object_fifo("inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty) + inOOB_L2L1_0 = object_fifo("inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty) + inOOB_L2L1_1 = object_fifo("inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty) + inOOB_L2L1_2 = object_fifo("inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty) + inOOB_L2L1_3 = object_fifo("inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty) + of_offsets = [np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)] + object_fifo_link( + inOOB_L3L2, + [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], + [], + of_offsets, + ) + + # Output RGBA + outOOB_L2L3 = object_fifo("outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty) + outOOB_L1L2_0 = object_fifo("outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty) + outOOB_L1L2_1 = object_fifo("outOOB_L1L2_1", 
ComputeTile3, MemTile, 2, line_ty) + outOOB_L1L2_2 = object_fifo("outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty) + outOOB_L1L2_3 = object_fifo("outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty) + object_fifo_link( + [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], + outOOB_L2L3, + of_offsets, + [], + ) + + # Runtime parameters + rtpComputeTile2 = buffer( + ComputeTile2, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile2", + use_write_rtp=True, + ) + rtpComputeTile3 = buffer( + ComputeTile3, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile3", + use_write_rtp=True, + ) + rtpComputeTile4 = buffer( + ComputeTile4, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile4", + use_write_rtp=True, + ) + rtpComputeTile5 = buffer( + ComputeTile5, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile5", + use_write_rtp=True, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - rtpComputeTile5[0] = 50 - rtpComputeTile5[1] = 255 - rtpComputeTile5[2] = 0 + inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - npu_dma_memcpy_nd( - metadata=inOOB_L3L2, - bd_id=1, - mem=inTensor, - sizes=[1, 1, 1, tensorSize], - issue_token=True, + inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, ) - npu_dma_memcpy_nd( - metadata=outOOB_L2L3, - bd_id=0, - mem=outTensor, - sizes=[1, 1, 1, tensorSize], + + inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = 
inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, ) - dma_wait(inOOB_L3L2, outOOB_L2L3) + inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence( + np.ndarray[(tensorSize,), np.dtype[np.int8]], + np.ndarray[(32,), np.dtype[np.int32]], # not used + np.ndarray[(tensorSize,), np.dtype[np.int8]], + ) + def sequence(inTensor, notUsed, outTensor): + # thresholdValue, maxValue, thresholdType + rtpComputeTile2[0] = 50 + rtpComputeTile2[1] = 255 + rtpComputeTile2[2] = 0 + + rtpComputeTile3[0] = 50 + rtpComputeTile3[1] = 255 + rtpComputeTile3[2] = 0 + + rtpComputeTile4[0] = 50 + rtpComputeTile4[1] = 255 + rtpComputeTile4[2] = 0 + + rtpComputeTile5[0] = 50 + rtpComputeTile5[1] = 255 + rtpComputeTile5[2] = 0 + + npu_dma_memcpy_nd( + metadata=inOOB_L3L2, + bd_id=1, + mem=inTensor, + sizes=[1, 1, 1, tensorSize], + issue_token=True, + ) + npu_dma_memcpy_nd( + metadata=outOOB_L2L3, + bd_id=0, + mem=outTensor, + sizes=[1, 1, 1, tensorSize], + ) + dma_wait(inOOB_L3L2, outOOB_L2L3) + + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 512 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 9 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: # print(ctx.module.operation.verify()) + color_threshold(dev, width, height) print(ctx.module) - - -color_threshold() diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py index fd36516f3b..84e2de3895 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py @@ -14,256 +14,238 @@ from aie.helpers.util import np_ndarray_type_get_shape from aie.helpers.dialects.ext.scf import _for as range_ -width = 512 -height = 9 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -lineWidth = width -lineWidthChannels = width * 4 # 4 channels -tensorSize = width * height - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def color_threshold(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - - # AIE Core Function declarations - thresholdLine = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) - - # AIE-array data movement with object fifos - - # Input RGBA broadcast + memtile for skip - 
inOOB_L3L2 = object_fifo( - "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty - ) - inOOB_L2L1_0 = object_fifo( - "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty - ) - inOOB_L2L1_1 = object_fifo( - "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty - ) - inOOB_L2L1_2 = object_fifo( - "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty - ) - inOOB_L2L1_3 = object_fifo( - "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty - ) - of_offsets = [ - np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) - ] - object_fifo_link( - inOOB_L3L2, - [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], - [], - of_offsets, - ) - - # Output RGBA - outOOB_L2L3 = object_fifo( - "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty - ) - outOOB_L1L2_0 = object_fifo( - "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty - ) - outOOB_L1L2_1 = object_fifo( - "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty - ) - outOOB_L1L2_2 = object_fifo( - "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty - ) - outOOB_L1L2_3 = object_fifo( - "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty - ) - object_fifo_link( - [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], - outOOB_L2L3, - of_offsets, - [], - ) - - # Runtime parameters - rtpComputeTile2 = buffer( - ComputeTile2, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile2", - use_write_rtp=True, - ) - rtpComputeTile3 = buffer( - ComputeTile3, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile3", - use_write_rtp=True, - ) - rtpComputeTile4 = buffer( - ComputeTile4, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile4", - use_write_rtp=True, - ) - rtpComputeTile5 = buffer( - ComputeTile5, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile5", - use_write_rtp=True, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right 
before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence( - np.ndarray[(tensorSize,), np.dtype[np.int8]], - np.ndarray[(32,), np.dtype[np.int32]], # not used - np.ndarray[(tensorSize,), np.dtype[np.int8]], - ) - def sequence(inTensor, notUsed, outTensor): - # thresholdValue, maxValue, thresholdType - rtpComputeTile2[0] = 50 - rtpComputeTile2[1] = 255 - rtpComputeTile2[2] = 0 - - rtpComputeTile3[0] = 50 - rtpComputeTile3[1] = 255 - rtpComputeTile3[2] = 0 - - rtpComputeTile4[0] = 50 - rtpComputeTile4[1] = 255 - rtpComputeTile4[2] = 0 - - rtpComputeTile5[0] = 50 - rtpComputeTile5[1] = 255 - rtpComputeTile5[2] = 0 +def color_threshold(dev, width, height): + lineWidth = width + lineWidthChannels = width * 4 # 4 channels + tensorSize = width * height + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + + # AIE Core Function declarations + thresholdLine = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + + # Input RGBA broadcast + memtile for skip + inOOB_L3L2 = object_fifo("inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty) + inOOB_L2L1_0 = object_fifo("inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty) + inOOB_L2L1_1 = object_fifo("inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty) + inOOB_L2L1_2 = object_fifo("inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty) + inOOB_L2L1_3 = object_fifo("inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty) + of_offsets = [np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)] + object_fifo_link( + inOOB_L3L2, + [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], + [], + of_offsets, + ) + + # Output RGBA + outOOB_L2L3 = object_fifo("outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty) + outOOB_L1L2_0 = object_fifo("outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty) + outOOB_L1L2_1 = object_fifo("outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty) + outOOB_L1L2_2 = object_fifo("outOOB_L1L2_2", ComputeTile4, 
MemTile, 2, line_ty) + outOOB_L1L2_3 = object_fifo("outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty) + object_fifo_link( + [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], + outOOB_L2L3, + of_offsets, + [], + ) + + # Runtime parameters + rtpComputeTile2 = buffer( + ComputeTile2, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile2", + use_write_rtp=True, + ) + rtpComputeTile3 = buffer( + ComputeTile3, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile3", + use_write_rtp=True, + ) + rtpComputeTile4 = buffer( + ComputeTile4, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile4", + use_write_rtp=True, + ) + rtpComputeTile5 = buffer( + ComputeTile5, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile5", + use_write_rtp=True, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - in_task = shim_dma_single_bd_task( - inOOB_L3L2, inTensor, sizes=[1, 1, 1, tensorSize], issue_token=True + inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, ) - out_task = shim_dma_single_bd_task( - outOOB_L2L3, - outTensor, - sizes=[1, 1, 1, tensorSize], - issue_token=True, + + inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, ) - dma_start_task(in_task, out_task) - dma_await_task(in_task, out_task) + inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction 
stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) + inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence( + np.ndarray[(tensorSize,), np.dtype[np.int8]], + np.ndarray[(32,), np.dtype[np.int32]], # not used + np.ndarray[(tensorSize,), np.dtype[np.int8]], + ) + def sequence(inTensor, notUsed, outTensor): + # thresholdValue, maxValue, thresholdType + rtpComputeTile2[0] = 50 + rtpComputeTile2[1] = 255 + rtpComputeTile2[2] = 0 + + rtpComputeTile3[0] = 50 + rtpComputeTile3[1] = 255 + rtpComputeTile3[2] = 0 + + rtpComputeTile4[0] = 50 + rtpComputeTile4[1] = 255 + rtpComputeTile4[2] = 0 + + rtpComputeTile5[0] = 50 + rtpComputeTile5[1] = 255 + rtpComputeTile5[2] = 0 + + in_task = shim_dma_single_bd_task( + inOOB_L3L2, inTensor, sizes=[1, 1, 1, tensorSize], issue_token=True + ) + out_task = shim_dma_single_bd_task( + outOOB_L2L3, + outTensor, + sizes=[1, 1, 1, tensorSize], + issue_token=True, + ) + + dma_start_task(in_task, out_task) + dma_await_task(in_task, out_task) + + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 512 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 9 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: # print(ctx.module.operation.verify()) + color_threshold(dev, width, height) print(ctx.module) - - -color_threshold() diff --git a/programming_examples/vision/color_threshold/run_makefile.lit b/programming_examples/vision/color_threshold/run_makefile.lit index c6e18a3da4..40fc6f201d 100644 --- a/programming_examples/vision/color_threshold/run_makefile.lit +++ b/programming_examples/vision/color_threshold/run_makefile.lit @@ -5,6 +5,5 @@ // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile - // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s - // CHECK: PASS! + // RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/color_threshold/run_makefile_alt.lit b/programming_examples/vision/color_threshold/run_makefile_alt.lit index 9f5617f16c..19bd34a2d0 100644 --- a/programming_examples/vision/color_threshold/run_makefile_alt.lit +++ b/programming_examples/vision/color_threshold/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! +// RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/color_threshold/run_strix_makefile.lit b/programming_examples/vision/color_threshold/run_strix_makefile.lit new file mode 100755 index 0000000000..0901bb542f --- /dev/null +++ b/programming_examples/vision/color_threshold/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p test_stx +// RUN: cd test_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile index e1ed21e0ae..2f6159bd3d 100755 --- a/programming_examples/vision/edge_detect/Makefile +++ b/programming_examples/vision/edge_detect/Makefile @@ -12,6 +12,7 @@ include ${srcdir}/../../makefile-common VPATH := ${srcdir}/../../../aie_kernels/aie2 +device = npu EDGEDETECT_WIDTH = 1920 EDGEDETECT_HEIGHT = 1080 @@ -36,7 +37,13 @@ mlir: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/%.cc.o: %.cc mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else ifeq ($(device),npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.cc.o mkdir -p ${@D} @@ -44,13 +51,18 @@ build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.c build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} - python3 $< ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@ + python3 $< ${device} ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@ build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o build/combined_gray2rgba_addWeighted.a mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ --no-xchesscc --no-xbridge \ --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +else + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +endif ${targetname}.exe: ${srcdir}/test.cpp rm -rf _build diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 59d7c030f2..4efc78dbf7 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -12,278 +12,282 @@ from aie.helpers.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx -width = 64 -height = 36 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -heightMinus1 = height - 1 -lineWidth = width -lineWidthInBytes = width * 4 -tensorSize = width * height * 4 # 4 channels - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def edge_detect(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] - - tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] - tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] - - # AIE Core Function declarations - rgba2gray_line = external_func( - "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] - ) - filter2d_line = external_func( - "filter2dLine", - 
inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], - ) - threshold_line = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - gray2rgba_line = external_func( - "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] - ) - add_weighted_line = external_func( - "addWeightedLine", - inputs=[ - line_bytes_ty, - line_bytes_ty, - line_bytes_ty, - np.int32, - np.int16, - np.int16, - np.int8, - ], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) - - # AIE-array data movement with object fifos - # Input - inOF_L3L2 = object_fifo( - "inOF_L3L2", - ShimTile, - [ComputeTile2, MemTile], - [2, 2, 7], +def edge_detect(dev, width, height): + heightMinus1 = height - 1 + lineWidth = width + lineWidthInBytes = width * 4 + tensorSize = width * height * 4 # 4 channels + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] + + tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] + tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] + + # AIE Core Function declarations + rgba2gray_line = external_func( + "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] + ) + filter2d_line = external_func( + "filter2dLine", + inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], + ) + threshold_line = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + gray2rgba_line = external_func( + "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] + ) + add_weighted_line = external_func( + "addWeightedLine", + inputs=[ line_bytes_ty, - ) - inOF_L2L1 = object_fifo( - "inOF_L2L1", - MemTile, - ComputeTile5, - 7, - line_bytes_ty, - ) - object_fifo_link(inOF_L3L2, inOF_L2L1) - - # Output - outOF_L2L3 = object_fifo( - "outOF_L2L3", - MemTile, - ShimTile, - 2, line_bytes_ty, - ) - outOF_L1L2 = object_fifo( - "outOF_L1L2", - ComputeTile5, - MemTile, - 2, line_bytes_ty, + np.int32, + np.int16, + np.int16, + np.int8, + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + # Input + inOF_L3L2 = object_fifo( + "inOF_L3L2", + ShimTile, + [ComputeTile2, MemTile], + [2, 2, 7], + line_bytes_ty, + ) + inOF_L2L1 = object_fifo( + "inOF_L2L1", + MemTile, + ComputeTile5, + 7, + line_bytes_ty, + ) + object_fifo_link(inOF_L3L2, inOF_L2L1) + + # Output + outOF_L2L3 = object_fifo( + "outOF_L2L3", + MemTile, + ShimTile, + 2, + line_bytes_ty, + ) + outOF_L1L2 = object_fifo( + "outOF_L1L2", + ComputeTile5, + MemTile, + 2, + line_bytes_ty, + ) + object_fifo_link(outOF_L1L2, outOF_L2L3) + + # Intermediate + OF_2to3 = object_fifo( + "OF_2to3", + ComputeTile2, + ComputeTile3, + 4, + line_ty, + ) + OF_3to4 = object_fifo( + "OF_3to4", + ComputeTile3, + ComputeTile4, + 2, + line_ty, + ) + OF_4to5 = object_fifo( + "OF_4to5", + ComputeTile4, + ComputeTile5, + 2, + line_ty, + ) + OF_5to5 = object_fifo( + "OF_5to5", + ComputeTile5, + ComputeTile5, + 1, + line_bytes_ty, + ) + + # Set up compute tiles + + # Compute tile 2 + 
@core(ComputeTile2, "rgba2gray.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) + + rgba2gray_line(elem_in, elem_out, lineWidth) + + inOF_L3L2.release(ObjectFifoPort.Consume, 1) + OF_2to3.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "filter2d.cc.o") + def core_body(): + v0 = 0 + v1 = 4096 + v_minus4 = -16384 + initial_value = np.array( + [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 ) - object_fifo_link(outOF_L1L2, outOF_L2L3) - - # Intermediate - OF_2to3 = object_fifo( - "OF_2to3", - ComputeTile2, + kernel = buffer( ComputeTile3, - 4, - line_ty, - ) - OF_3to4 = object_fifo( - "OF_3to4", - ComputeTile3, - ComputeTile4, - 2, - line_ty, - ) - OF_4to5 = object_fifo( - "OF_4to5", - ComputeTile4, - ComputeTile5, - 2, - line_ty, + np.ndarray[(3, 3), np.dtype[np.int16]], + "kernel", + initial_value=initial_value, ) - OF_5to5 = object_fifo( - "OF_5to5", - ComputeTile5, - ComputeTile5, - 1, - line_bytes_ty, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "rgba2gray.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) - - rgba2gray_line(elem_in, elem_out, lineWidth) - inOF_L3L2.release(ObjectFifoPort.Consume, 1) - OF_2to3.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "filter2d.cc.o") - def core_body(): - v0 = 0 - v1 = 4096 - v_minus4 = -16384 - initial_value = np.array( - [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 - ) - kernel = buffer( - ComputeTile3, - np.ndarray[(3, 3), np.dtype[np.int16]], - "kernel", - initial_value=initial_value, + for _ in range_(sys.maxsize): + # Preamble : Top Border + elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_pre[0], + elems_in_pre[0], + elems_in_pre[1], + elem_pre_out, + lineWidth, + kernel, ) + OF_3to4.release(ObjectFifoPort.Produce, 1) - for _ in range_(sys.maxsize): - # Preamble : Top Border - elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + # Steady State : Middle + for _ in range_(1, heightMinus1): + elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) + elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) filter2d_line( - elems_in_pre[0], - elems_in_pre[0], - elems_in_pre[1], - elem_pre_out, + elems_in[0], + elems_in[1], + elems_in[2], + elem_out, lineWidth, kernel, ) + OF_2to3.release(ObjectFifoPort.Consume, 1) OF_3to4.release(ObjectFifoPort.Produce, 1) - # Steady State : Middle - for _ in range_(1, heightMinus1): - elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) - elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in[0], - elems_in[1], - elems_in[2], - elem_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 1) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Postamble : Bottom Border - elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in_post[0], - elems_in_post[1], - elems_in_post[1], - elem_post_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 2) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, 
"threshold.cc.o") - def core_body(): - v_thr = 10 - v_max = 255 - v_typ = 0 - - for _ in range_(sys.maxsize): - elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) - - threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) - - OF_3to4.release(ObjectFifoPort.Consume, 1) - OF_4to5.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) - - gray2rgba_line(elem_in, elem_out, lineWidth) - - OF_4to5.release(ObjectFifoPort.Consume, 1) - OF_5to5.release(ObjectFifoPort.Produce, 1) - - elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) - elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) - elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) - - alpha = 16384 - beta = 16384 - gamma = 0 - - add_weighted_line( - elem_in1, - elem_in2, - elem_out2, - lineWidthInBytes, - alpha, - beta, - gamma, - ) - - OF_5to5.release(ObjectFifoPort.Consume, 1) - inOF_L2L1.release(ObjectFifoPort.Consume, 1) - outOF_L1L2.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) - def sequence(I, B, O): - npu_dma_memcpy_nd( - metadata=inOF_L3L2, - bd_id=1, - mem=I, - sizes=[1, 1, 1, tensorSize], + # Postamble : Bottom Border + elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_post[0], + elems_in_post[1], + elems_in_post[1], + elem_post_out, + lineWidth, + kernel, ) - npu_dma_memcpy_nd( - metadata=outOF_L2L3, - bd_id=0, - mem=O, - sizes=[1, 1, 1, tensorSize], + OF_2to3.release(ObjectFifoPort.Consume, 2) + OF_3to4.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + v_thr = 10 + v_max = 255 + v_typ = 0 + + for _ in range_(sys.maxsize): + elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) + + threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) + + OF_3to4.release(ObjectFifoPort.Consume, 1) + OF_4to5.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) + + gray2rgba_line(elem_in, elem_out, lineWidth) + + OF_4to5.release(ObjectFifoPort.Consume, 1) + OF_5to5.release(ObjectFifoPort.Produce, 1) + + elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) + elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) + elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) + + alpha = 16384 + beta = 16384 + gamma = 0 + + add_weighted_line( + elem_in1, + elem_in2, + elem_out2, + lineWidthInBytes, + alpha, + beta, + gamma, ) - # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all - dma_wait(outOF_L2L3) - # print(ctx.module.operation.verify()) + OF_5to5.release(ObjectFifoPort.Consume, 1) + inOF_L2L1.release(ObjectFifoPort.Consume, 1) + outOF_L1L2.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) + def sequence(I, B, O): + npu_dma_memcpy_nd( + metadata=inOF_L3L2, + bd_id=1, + mem=I, + sizes=[1, 
1, 1, tensorSize], + ) + npu_dma_memcpy_nd( + metadata=outOF_L2L3, + bd_id=0, + mem=O, + sizes=[1, 1, 1, tensorSize], + ) + # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all + dma_wait(outOF_L2L3) + + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 36 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 64 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: + # print(ctx.module.operation.verify()) + edge_detect(dev, width, height) print(ctx.module) - - -edge_detect() diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py index bbbdc586b6..75a34e5533 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py @@ -12,277 +12,279 @@ from aie.helpers.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx -width = 64 -height = 36 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -heightMinus1 = height - 1 -lineWidth = width -lineWidthInBytes = width * 4 -tensorSize = width * height * 4 # 4 channels - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def edge_detect(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] - - tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] - tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] - - # AIE Core Function declarations - rgba2gray_line = external_func( - "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] - ) - filter2d_line = external_func( - "filter2dLine", - inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], - ) - threshold_line = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - gray2rgba_line = external_func( - "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] - ) - add_weighted_line = external_func( - "addWeightedLine", - inputs=[ - line_bytes_ty, - line_bytes_ty, - line_bytes_ty, - np.int32, - np.int16, - np.int16, - np.int8, - ], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) - - # AIE-array data movement with object fifos - # Input - inOF_L3L2 = object_fifo( - "inOF_L3L2", - ShimTile, - [ComputeTile2, MemTile], - [2, 2, 7], +def edge_detect(dev, width, height): + heightMinus1 = height - 1 + lineWidth = width + lineWidthInBytes = width * 4 + tensorSize = width * height * 4 # 4 channels + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] + + tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] + tensor_16x16_ty = np.ndarray[(16, 
16), np.dtype[np.int32]] + + # AIE Core Function declarations + rgba2gray_line = external_func( + "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] + ) + filter2d_line = external_func( + "filter2dLine", + inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], + ) + threshold_line = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + gray2rgba_line = external_func( + "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] + ) + add_weighted_line = external_func( + "addWeightedLine", + inputs=[ line_bytes_ty, - ) - inOF_L2L1 = object_fifo( - "inOF_L2L1", - MemTile, - ComputeTile5, - 7, - line_bytes_ty, - ) - object_fifo_link(inOF_L3L2, inOF_L2L1) - - # Output - outOF_L2L3 = object_fifo( - "outOF_L2L3", - MemTile, - ShimTile, - 2, line_bytes_ty, - ) - outOF_L1L2 = object_fifo( - "outOF_L1L2", - ComputeTile5, - MemTile, - 2, line_bytes_ty, + np.int32, + np.int16, + np.int16, + np.int8, + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + # Input + inOF_L3L2 = object_fifo( + "inOF_L3L2", + ShimTile, + [ComputeTile2, MemTile], + [2, 2, 7], + line_bytes_ty, + ) + inOF_L2L1 = object_fifo( + "inOF_L2L1", + MemTile, + ComputeTile5, + 7, + line_bytes_ty, + ) + object_fifo_link(inOF_L3L2, inOF_L2L1) + + # Output + outOF_L2L3 = object_fifo( + "outOF_L2L3", + MemTile, + ShimTile, + 2, + line_bytes_ty, + ) + outOF_L1L2 = object_fifo( + "outOF_L1L2", + ComputeTile5, + MemTile, + 2, + line_bytes_ty, + ) + object_fifo_link(outOF_L1L2, outOF_L2L3) + + # Intermediate + OF_2to3 = object_fifo( + "OF_2to3", + ComputeTile2, + ComputeTile3, + 4, + line_ty, + ) + OF_3to4 = object_fifo( + "OF_3to4", + ComputeTile3, + ComputeTile4, + 2, + line_ty, + ) + OF_4to5 = object_fifo( + "OF_4to5", + ComputeTile4, + ComputeTile5, + 2, + line_ty, + ) + OF_5to5 = object_fifo( + "OF_5to5", + ComputeTile5, + ComputeTile5, + 1, + line_bytes_ty, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "rgba2gray.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) + + rgba2gray_line(elem_in, elem_out, lineWidth) + + inOF_L3L2.release(ObjectFifoPort.Consume, 1) + OF_2to3.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "filter2d.cc.o") + def core_body(): + v0 = 0 + v1 = 4096 + v_minus4 = -16384 + initial_value = np.array( + [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 ) - object_fifo_link(outOF_L1L2, outOF_L2L3) - - # Intermediate - OF_2to3 = object_fifo( - "OF_2to3", - ComputeTile2, + kernel = buffer( ComputeTile3, - 4, - line_ty, - ) - OF_3to4 = object_fifo( - "OF_3to4", - ComputeTile3, - ComputeTile4, - 2, - line_ty, - ) - OF_4to5 = object_fifo( - "OF_4to5", - ComputeTile4, - ComputeTile5, - 2, - line_ty, + np.ndarray[(3, 3), np.dtype[np.int16]], + "kernel", + initial_value=initial_value, ) - OF_5to5 = object_fifo( - "OF_5to5", - ComputeTile5, - ComputeTile5, - 1, - line_bytes_ty, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "rgba2gray.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) - - rgba2gray_line(elem_in, elem_out, lineWidth) - 
inOF_L3L2.release(ObjectFifoPort.Consume, 1) - OF_2to3.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "filter2d.cc.o") - def core_body(): - v0 = 0 - v1 = 4096 - v_minus4 = -16384 - initial_value = np.array( - [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 - ) - kernel = buffer( - ComputeTile3, - np.ndarray[(3, 3), np.dtype[np.int16]], - "kernel", - initial_value=initial_value, + for _ in range_(sys.maxsize): + # Preamble : Top Border + elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_pre[0], + elems_in_pre[0], + elems_in_pre[1], + elem_pre_out, + lineWidth, + kernel, ) + OF_3to4.release(ObjectFifoPort.Produce, 1) - for _ in range_(sys.maxsize): - # Preamble : Top Border - elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + # Steady State : Middle + for _ in range_(1, heightMinus1): + elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) + elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) filter2d_line( - elems_in_pre[0], - elems_in_pre[0], - elems_in_pre[1], - elem_pre_out, + elems_in[0], + elems_in[1], + elems_in[2], + elem_out, lineWidth, kernel, ) + OF_2to3.release(ObjectFifoPort.Consume, 1) OF_3to4.release(ObjectFifoPort.Produce, 1) - # Steady State : Middle - for _ in range_(1, heightMinus1): - elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) - elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in[0], - elems_in[1], - elems_in[2], - elem_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 1) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Postamble : Bottom Border - elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in_post[0], - elems_in_post[1], - elems_in_post[1], - elem_post_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 2) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - v_thr = 10 - v_max = 255 - v_typ = 0 - - for _ in range_(sys.maxsize): - elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) - - threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) - - OF_3to4.release(ObjectFifoPort.Consume, 1) - OF_4to5.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) - - gray2rgba_line(elem_in, elem_out, lineWidth) - - OF_4to5.release(ObjectFifoPort.Consume, 1) - OF_5to5.release(ObjectFifoPort.Produce, 1) - - elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) - elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) - elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) - - alpha = 16384 - beta = 16384 - gamma = 0 - - add_weighted_line( - elem_in1, - elem_in2, - elem_out2, - lineWidthInBytes, - alpha, - beta, - gamma, - ) - - OF_5to5.release(ObjectFifoPort.Consume, 1) - inOF_L2L1.release(ObjectFifoPort.Consume, 1) - outOF_L1L2.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) - def sequence(I, B, O): - in_task = shim_dma_single_bd_task( - 
inOF_L3L2, I, sizes=[1, 1, 1, tensorSize] + # Postamble : Bottom Border + elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_post[0], + elems_in_post[1], + elems_in_post[1], + elem_post_out, + lineWidth, + kernel, ) - out_task = shim_dma_single_bd_task( - outOF_L2L3, - O, - sizes=[1, 1, 1, tensorSize], - issue_token=True, + OF_2to3.release(ObjectFifoPort.Consume, 2) + OF_3to4.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + v_thr = 10 + v_max = 255 + v_typ = 0 + + for _ in range_(sys.maxsize): + elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) + + threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) + + OF_3to4.release(ObjectFifoPort.Consume, 1) + OF_4to5.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) + + gray2rgba_line(elem_in, elem_out, lineWidth) + + OF_4to5.release(ObjectFifoPort.Consume, 1) + OF_5to5.release(ObjectFifoPort.Produce, 1) + + elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) + elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) + elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) + + alpha = 16384 + beta = 16384 + gamma = 0 + + add_weighted_line( + elem_in1, + elem_in2, + elem_out2, + lineWidthInBytes, + alpha, + beta, + gamma, ) - dma_start_task(in_task, out_task) - dma_await_task(out_task) - dma_free_task(in_task) + OF_5to5.release(ObjectFifoPort.Consume, 1) + inOF_L2L1.release(ObjectFifoPort.Consume, 1) + outOF_L1L2.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) + def sequence(I, B, O): + in_task = shim_dma_single_bd_task(inOF_L3L2, I, sizes=[1, 1, 1, tensorSize]) + out_task = shim_dma_single_bd_task( + outOF_L2L3, + O, + sizes=[1, 1, 1, tensorSize], + issue_token=True, + ) - # print(ctx.module.operation.verify()) + dma_start_task(in_task, out_task) + dma_await_task(out_task) + dma_free_task(in_task) + + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 36 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 64 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: + # print(ctx.module.operation.verify()) + edge_detect(dev, width, height) print(ctx.module) - - -edge_detect() diff --git a/programming_examples/vision/edge_detect/run_makefile.lit b/programming_examples/vision/edge_detect/run_makefile.lit index c6e18a3da4..2368db78ff 100644 --- a/programming_examples/vision/edge_detect/run_makefile.lit +++ b/programming_examples/vision/edge_detect/run_makefile.lit @@ -5,6 +5,4 @@ // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile - // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s - // CHECK: PASS! 
- \ No newline at end of file + // RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/edge_detect/run_makefile_alt.lit b/programming_examples/vision/edge_detect/run_makefile_alt.lit index 9f5617f16c..19bd34a2d0 100644 --- a/programming_examples/vision/edge_detect/run_makefile_alt.lit +++ b/programming_examples/vision/edge_detect/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! +// RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/edge_detect/run_strix_makefile.lit b/programming_examples/vision/edge_detect/run_strix_makefile.lit new file mode 100755 index 0000000000..0901bb542f --- /dev/null +++ b/programming_examples/vision/edge_detect/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p test_stx +// RUN: cd test_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2
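
For orientation only (not part of the patch): with the device parameter threaded through the Makefiles and generator scripts above, the edge_detect example would be built and run per target roughly as sketched below. The working directory, image dimensions, and output path are illustrative assumptions, not values mandated by the change.

# Hypothetical manual invocation of the updated flow; paths and sizes are examples only.
# "npu" maps to AIEDevice.npu1_1col and builds kernels with Peano clang++,
# "npu2" maps to AIEDevice.npu2 and builds kernels with xchesscc_wrapper.
mkdir -p build
python3 aie2_edgeDetect.py npu2 1920 1080 > build/aie2_lineBased_8b_1920.mlir
make device=npu2        # compile kernels and generate the xclbin for npu2
make run device=npu2    # execute on an npu2 (Strix) Ryzen AI device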