From 1b3af9fc1b372d41ed9335f2b2d9055e37672bcf Mon Sep 17 00:00:00 2001 From: Jack Lo <36210336+jackl-xilinx@users.noreply.github.com> Date: Wed, 13 Mar 2024 11:24:29 -0700 Subject: [PATCH] ReLU with tracing (#1204) ReLU example with tracing Co-authored-by: pjr Co-authored-by: Joseph Melber Ml eltwise add and mul (#1207) Move around of the eltwise add (put it in ml) and a new eltwise mul kernel Co-authored-by: pjr Co-authored-by: Jeff Fifield Moved test_lib to runtime_lib/test_lib for now Pjr reduce (#1222) Reduce programming examples Co-authored-by: pjr Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> [ASPLOS][WIP] Passthrough kernel in basic examples (#1216) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> fix paths run.lit passthrough _kernel (#1225) Fixed CMakeLists.txt reference to test_utils.h (#1223) Minor CMakeLists.txt and Makefile fixes for programming_examples (#1227) --- aie_kernels/aie2/add.cc | 61 ++++ .../aie_generic}/passThrough.cc | 17 +- aie_kernels/generic/vector_max.cc | 53 ++++ aie_kernels/generic/vector_min.cc | 53 ++++ aie_kernels/relu.cc | 41 +++ .../basic/eltwise_exp/Makefile | 53 ++++ .../basic/eltwise_mul/CMakeLists.txt | 8 +- .../basic/eltwise_mul/Makefile | 7 +- .../basic/eltwise_mul/test.py | 2 +- .../basic/log_hello_world/CMakeLists.txt | 7 + .../basic/log_hello_world/Makefile | 2 +- .../matrix_multiplication/CMakeLists.txt | 7 + .../matrix_multiplication/makefile-common | 18 +- .../matrix_vector/Makefile | 4 +- .../single_core/Makefile | 3 +- .../basic/passthrough_dmas/CMakeLists.txt | 75 +++++ .../Makefile | 12 +- .../aie2.py | 6 +- .../run.lit | 2 +- .../run_vck5000.lit | 0 .../test.cpp | 0 .../test_vck5000.cpp | 0 .../basic/passthrough_kernel/CMakeLists.txt | 80 ++++++ .../basic/passthrough_kernel/Makefile | 49 ++++ .../basic/passthrough_kernel/aie2.py | 170 ++++++++++++ .../basic/passthrough_kernel/run.lit | 12 + 
.../basic/passthrough_kernel/test.cpp | 118 ++++++++ .../basic/relu/CMakeLists.txt | 7 + programming_examples/basic/relu/Makefile | 13 +- .../basic/vector_add_reduce/CMakeLists.txt | 69 +++++ .../basic/vector_add_reduce/Makefile | 76 +++++ .../basic/vector_add_reduce/aie2.py | 90 ++++++ .../basic/vector_add_reduce/run.lit | 13 + .../basic/vector_add_reduce/run_vck5000.lit | 9 + .../basic/vector_add_reduce/test.cpp | 261 ++++++++++++++++++ .../basic/vector_add_reduce/test_vck5000.cpp | 146 ++++++++++ .../basic/vector_bias_add/CMakeLists.txt | 7 + .../basic/vector_bias_add/Makefile | 2 +- .../basic/vector_exp/CMakeLists.txt | 7 + .../basic/vector_exp/Makefile | 10 +- .../basic/vector_max/CMakeLists.txt | 1 + .../basic/vector_max/Makefile | 67 +++-- programming_examples/basic/vector_max/aie2.py | 43 ++- programming_examples/basic/vector_max/run.lit | 2 + .../basic/vector_max/test.cpp | 261 +++++++++++------- .../CMakeLists.txt | 3 +- .../basic/vector_max_reduce/Makefile | 76 +++++ .../basic/vector_max_reduce/README.md | 29 ++ .../basic/vector_max_reduce/aie2.py | 86 ++++++ .../basic/vector_max_reduce/run.lit | 13 + .../basic/vector_max_reduce/run_vck5000.lit | 9 + .../basic/vector_max_reduce/test.cpp | 261 ++++++++++++++++++ .../basic/vector_max_reduce/test_vck5000.cpp | 146 ++++++++++ .../basic/vector_min/CMakeLists.txt | 1 + .../basic/vector_min/Makefile | 67 +++-- .../basic/vector_min/Makefile.pjr | 67 +++++ .../basic/vector_min/README.md | 4 +- programming_examples/basic/vector_min/aie2.py | 43 ++- programming_examples/basic/vector_min/run.lit | 2 + .../basic/vector_min/run_vck5000.lit | 2 +- .../basic/vector_min/test.cpp | 261 +++++++++++------- .../basic/vector_min_reduce/CMakeLists.txt | 69 +++++ .../basic/vector_min_reduce/Makefile | 76 +++++ .../basic/vector_min_reduce/README.md | 29 ++ .../basic/vector_min_reduce/aie2.py | 86 ++++++ .../basic/vector_min_reduce/run.lit | 13 + .../basic/vector_min_reduce/run_vck5000.lit | 9 + 
.../basic/vector_min_reduce/test.cpp | 261 ++++++++++++++++++ .../basic/vector_scalar/CMakeLists.txt | 7 + .../basic/vector_scalar/Makefile | 2 +- .../basic/vector_softmax/CMakeLists.txt | 7 + .../basic/vector_softmax/Makefile | 10 +- .../basic/vector_sum/CMakeLists.txt | 9 + .../basic/weight_expand/CMakeLists.txt | 7 + .../basic/weight_expand/Makefile | 8 +- .../{basic => }/makefile-common | 0 .../ml/eltwise_add/CMakeLists.txt | 8 +- programming_examples/ml/eltwise_add/Makefile | 7 +- programming_examples/utils/makefile-common | 22 -- .../vision/color_detect/CMakeLists.txt | 10 +- .../vision/color_detect/Makefile | 3 +- .../vision/color_detect/test.cpp | 5 +- .../vision/color_threshold/CMakeLists.txt | 10 +- .../vision/color_threshold/Makefile | 2 +- .../vision/color_threshold/test.cpp | 18 +- .../vision/edge_detect/CMakeLists.txt | 10 +- .../vision/edge_detect/Makefile | 2 +- .../vision/edge_detect/test.cpp | 5 +- .../vision/passthrough/CMakeLists.txt | 12 +- .../vision/passthrough/Makefile | 4 +- .../vision/passthrough/run.lit | 2 +- .../vision/passthrough/test.cpp | 5 +- .../utils => python}/test_utils.py | 1 - .../test_lib/test_utils.cpp | 160 +++++------ runtime_lib/test_lib/test_utils.h | 119 ++++++++ .../matrix_multiplication_using_dma/run.lit | 1 - 96 files changed, 3496 insertions(+), 517 deletions(-) create mode 100755 aie_kernels/aie2/add.cc rename {programming_examples/vision/vision_kernels => aie_kernels/aie_generic}/passThrough.cc (84%) create mode 100644 aie_kernels/generic/vector_max.cc create mode 100644 aie_kernels/generic/vector_min.cc create mode 100644 aie_kernels/relu.cc create mode 100644 programming_examples/basic/eltwise_exp/Makefile create mode 100644 programming_examples/basic/passthrough_dmas/CMakeLists.txt rename programming_examples/basic/{passthrough_hardware => passthrough_dmas}/Makefile (93%) mode change 100755 => 100644 rename programming_examples/basic/{passthrough_hardware => passthrough_dmas}/aie2.py (92%) rename 
programming_examples/basic/{passthrough_hardware => passthrough_dmas}/run.lit (94%) rename programming_examples/basic/{passthrough_hardware => passthrough_dmas}/run_vck5000.lit (100%) rename programming_examples/basic/{passthrough_hardware => passthrough_dmas}/test.cpp (100%) rename programming_examples/basic/{passthrough_hardware => passthrough_dmas}/test_vck5000.cpp (100%) create mode 100644 programming_examples/basic/passthrough_kernel/CMakeLists.txt create mode 100644 programming_examples/basic/passthrough_kernel/Makefile create mode 100644 programming_examples/basic/passthrough_kernel/aie2.py create mode 100644 programming_examples/basic/passthrough_kernel/run.lit create mode 100644 programming_examples/basic/passthrough_kernel/test.cpp create mode 100644 programming_examples/basic/vector_add_reduce/CMakeLists.txt create mode 100644 programming_examples/basic/vector_add_reduce/Makefile create mode 100644 programming_examples/basic/vector_add_reduce/aie2.py create mode 100644 programming_examples/basic/vector_add_reduce/run.lit create mode 100644 programming_examples/basic/vector_add_reduce/run_vck5000.lit create mode 100644 programming_examples/basic/vector_add_reduce/test.cpp create mode 100644 programming_examples/basic/vector_add_reduce/test_vck5000.cpp rename programming_examples/basic/{passthrough_hardware => vector_max_reduce}/CMakeLists.txt (96%) create mode 100755 programming_examples/basic/vector_max_reduce/Makefile create mode 100644 programming_examples/basic/vector_max_reduce/README.md create mode 100755 programming_examples/basic/vector_max_reduce/aie2.py create mode 100644 programming_examples/basic/vector_max_reduce/run.lit create mode 100644 programming_examples/basic/vector_max_reduce/run_vck5000.lit create mode 100644 programming_examples/basic/vector_max_reduce/test.cpp create mode 100644 programming_examples/basic/vector_max_reduce/test_vck5000.cpp create mode 100755 programming_examples/basic/vector_min/Makefile.pjr create mode 100644 
programming_examples/basic/vector_min_reduce/CMakeLists.txt create mode 100755 programming_examples/basic/vector_min_reduce/Makefile create mode 100644 programming_examples/basic/vector_min_reduce/README.md create mode 100755 programming_examples/basic/vector_min_reduce/aie2.py create mode 100644 programming_examples/basic/vector_min_reduce/run.lit create mode 100644 programming_examples/basic/vector_min_reduce/run_vck5000.lit create mode 100644 programming_examples/basic/vector_min_reduce/test.cpp rename programming_examples/{basic => }/makefile-common (100%) delete mode 100644 programming_examples/utils/makefile-common rename {programming_examples/utils => python}/test_utils.py (99%) rename programming_examples/utils/test_utils.h => runtime_lib/test_lib/test_utils.cpp (53%) create mode 100644 runtime_lib/test_lib/test_utils.h diff --git a/aie_kernels/aie2/add.cc b/aie_kernels/aie2/add.cc new file mode 100755 index 0000000000..75a0552ec8 --- /dev/null +++ b/aie_kernels/aie2/add.cc @@ -0,0 +1,61 @@ +//===- add.cc ---------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#define __AIENGINE__ 2 +#define NOCPP +#define __AIEARCH__ 20 + +#include +#include +#include +#include + +#include + +template +void eltwise_add(T_in *a, T_in *b, T_out *c) { + for (int i = 0; i < N; i++) { + c[i] = a[i] + b[i]; + } +} + +template +void eltwise_vadd(T_in *a, T_in *b, T_out *c) { + + constexpr int vec_factor = 16; + event0(); + T_in *__restrict pA1 = a; + T_in *__restrict pB1 = b; + T_out *__restrict pC1 = c; + const int F = N / vec_factor; + for (int i = 0; i < F; i++) + chess_prepare_for_pipelining chess_loop_range(16, ) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; + aie::vector B0 = aie::load_v(pB1); + pB1 += vec_factor; + aie::vector cout = aie::add(A0, B0); + aie::store_v(pC1, cout); + pC1 += vec_factor; + } + event1(); +} + +extern "C" { + +void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) { + eltwise_add(a_in, b_in, c_out); +} + +void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) { + eltwise_vadd(a_in, b_in, c_out); +} + +} // extern "C" diff --git a/programming_examples/vision/vision_kernels/passThrough.cc b/aie_kernels/aie_generic/passThrough.cc similarity index 84% rename from programming_examples/vision/vision_kernels/passThrough.cc rename to aie_kernels/aie_generic/passThrough.cc index 0928af33f0..1de33066d1 100644 --- a/programming_examples/vision/vision_kernels/passThrough.cc +++ b/aie_kernels/aie_generic/passThrough.cc @@ -15,30 +15,21 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include template __attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out, const int32_t height, const int32_t width) { - //::aie::vector data_out; - //::aie::mask temp_val; + event0(); + v64uint8 *restrict outPtr = (v64uint8 *)out; v64uint8 *restrict inPtr = (v64uint8 *)in; for (int j = 0; j < (height * width); j += N) // Nx samples per loop - 
chess_prepare_for_pipelining chess_loop_range(6, ) { - //::aie::vector tmpVector = ::aie::load_v(in); - //::aie::store_v(out, tmpVector); - - *outPtr++ = *inPtr++; + chess_prepare_for_pipelining chess_loop_range(6, ) { *outPtr++ = *inPtr++; } - // in += N; - // out += N; - } + event1(); } extern "C" { diff --git a/aie_kernels/generic/vector_max.cc b/aie_kernels/generic/vector_max.cc new file mode 100644 index 0000000000..f94646ff0c --- /dev/null +++ b/aie_kernels/generic/vector_max.cc @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +#include + +void vector(int32_t *restrict in, int32_t *restrict out) { + + v16int32 tiny = broadcast_to_v16int32((int32_t)-2147483648); + int32_t input_size = 1024; + int32_t vector_size = 16; + v16int32 after_vector; + v16int32 running_max = tiny; + for (int32_t i = 0; i < input_size; i += vector_size) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16int32 next = *(v16int32 *)(in + i); + v16int32 test = max(running_max, next); + running_max = test; + } + after_vector = running_max; + v16int32 first = shift_bytes(after_vector, after_vector, 32); + v16int32 second = max(after_vector, first); + v16int32 second_shift = shift_bytes(second, second, 16); + v16int32 third = max(second, second_shift); + v16int32 third_shift = shift_bytes(third, third, 8); + v16int32 fourth = max(third, third_shift); + v16int32 fourth_shift = shift_bytes(fourth, fourth, 4); + v16int32 fifth = max(fourth, fourth_shift); + int32_t last = extract_elem(fifth, 0); + *(int32_t *)out = last; + return; +} + +void scalar(int32_t *restrict in, int32_t *restrict out) { + size_t input_size = 1024; + int32_t running_max = (int32_t)-2147483648; + for (int32_t i = 0; i < input_size; i++) { + if (in[i] > running_max) + running_max = in[i]; + } + *(int32_t *)out = running_max; + + return; +} + +extern "C" { + +void vector_max(int32_t *a_in, int32_t *c_out) { vector(a_in, c_out); } + +void scalar_max(int32_t *a_in, int32_t *c_out) { scalar(a_in, c_out); } 
+ +} // extern "C" diff --git a/aie_kernels/generic/vector_min.cc b/aie_kernels/generic/vector_min.cc new file mode 100644 index 0000000000..c60ca574a1 --- /dev/null +++ b/aie_kernels/generic/vector_min.cc @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +#include + +void vector(int32_t *restrict in, int32_t *restrict out) { + + v16int32 massive = broadcast_to_v16int32((int32_t)2147483647); + int32_t input_size = 1024; + int32_t vector_size = 16; + v16int32 after_vector; + v16int32 running_min = massive; + for (int32_t i = 0; i < input_size; i += vector_size) + chess_prepare_for_pipelining chess_loop_range(64, 64) { + v16int32 next = *(v16int32 *)(in + i); + v16int32 test = min(running_min, next); + running_min = test; + } + after_vector = running_min; + v16int32 first = shift_bytes(after_vector, after_vector, 32); + v16int32 second = min(after_vector, first); + v16int32 second_shift = shift_bytes(second, second, 16); + v16int32 third = min(second, second_shift); + v16int32 third_shift = shift_bytes(third, third, 8); + v16int32 fourth = min(third, third_shift); + v16int32 fourth_shift = shift_bytes(fourth, fourth, 4); + v16int32 fifth = min(fourth, fourth_shift); + int32_t last = extract_elem(fifth, 0); + *(int32_t *)out = last; + return; +} + +void scalar(int32_t *restrict in, int32_t *restrict out) { + size_t input_size = 1024; + int32_t running_min = (int32_t)2147483647; + for (int32_t i = 0; i < input_size; i++) { + if (in[i] < running_min) + running_min = in[i]; + } + *(int32_t *)out = running_min; + + return; +} + +extern "C" { + +void vector_min(int32_t *a_in, int32_t *c_out) { vector(a_in, c_out); } + +void scalar_min(int32_t *a_in, int32_t *c_out) { scalar(a_in, c_out); } + +} // extern "C" diff --git a/aie_kernels/relu.cc b/aie_kernels/relu.cc new file mode 100644 index 0000000000..a2e87cffc4 --- /dev/null +++ b/aie_kernels/relu.cc @@ -0,0 +1,41 @@ +//===- scale.cc -------------------------------------------------*- C++ -*-===// +// +// This 
file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#define __AIENGINE__ 2 +#define NOCPP +#define __AIEARCH__ 20 + +#include +#include +#include +#include + +#include + +void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) { + const int v_factor = 32; + v32bfloat16 zeroes = broadcast_zero_bfloat16(); + + event0(); + for (size_t i = 0; i < TILE_SIZE; i += v_factor) + chess_prepare_for_pipelining chess_loop_range(32, 32) { + v32bfloat16 input = *(v32bfloat16 *)(a + i); + v32bfloat16 output = max(input, zeroes); + *(v32bfloat16 *)(c + i) = output; + } + event1(); + return; +} + +extern "C" { + +void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) { relu(a_in, c_out, 1024); } + +} // extern "C" diff --git a/programming_examples/basic/eltwise_exp/Makefile b/programming_examples/basic/eltwise_exp/Makefile new file mode 100644 index 0000000000..167ac5d741 --- /dev/null +++ b/programming_examples/basic/eltwise_exp/Makefile @@ -0,0 +1,53 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../makefile-common + +all: build/final.xclbin + +targetname = eltwise_exp + +build/lut_based_ops.o: + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F} + +build/exp.o: + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. 
-I ../../../../aie_runtime_lib/AIE2 -c ../../../../aie_kernels/aie2/bf16_exp.cc -o ${@F} + +build/kernels.a: build/exp.o build/lut_based_ops.o + ar rvs $@ $+ + +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< > $@ + +build/final.xclbin: build/aie.mlir build/kernels.a + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ $@ @@ -71,4 +69,4 @@ vck5000: build/aie.mlir clean: - rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe + rm -rf build _build inst ${targetname}.exe diff --git a/programming_examples/basic/passthrough_hardware/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py similarity index 92% rename from programming_examples/basic/passthrough_hardware/aie2.py rename to programming_examples/basic/passthrough_dmas/aie2.py index b2f30745b9..1f4b4882ba 100755 --- a/programming_examples/basic/passthrough_hardware/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -14,7 +14,6 @@ from aie.extras.context import mlir_mod_ctx N = 4096 -N_in_bytes = N * 4 # Deciphering the command line arguments @@ -54,9 +53,8 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - tmp = memref.alloc(1, T.i32()) - v0 = arith.constant(0, T.i32()) - memref.store(v0, tmp, [0]) + for _ in for_(sys.maxsize): + yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/passthrough_hardware/run.lit b/programming_examples/basic/passthrough_dmas/run.lit similarity index 94% rename from programming_examples/basic/passthrough_hardware/run.lit rename to programming_examples/basic/passthrough_dmas/run.lit index 62d66040ff..a4f5d568b6 100644 --- a/programming_examples/basic/passthrough_hardware/run.lit +++ b/programming_examples/basic/passthrough_dmas/run.lit @@ -1,7 +1,7 @@ // (c) Copyright 2023 Advanced Micro Devices, Inc. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// REQUIRES: ryzen_ai, chess +// REQUIRES: ryzen_ai // // RUN: %python %S/aie2.py ipu 0 > ./aie.mlir // RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir diff --git a/programming_examples/basic/passthrough_hardware/run_vck5000.lit b/programming_examples/basic/passthrough_dmas/run_vck5000.lit similarity index 100% rename from programming_examples/basic/passthrough_hardware/run_vck5000.lit rename to programming_examples/basic/passthrough_dmas/run_vck5000.lit diff --git a/programming_examples/basic/passthrough_hardware/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp similarity index 100% rename from programming_examples/basic/passthrough_hardware/test.cpp rename to programming_examples/basic/passthrough_dmas/test.cpp diff --git a/programming_examples/basic/passthrough_hardware/test_vck5000.cpp b/programming_examples/basic/passthrough_dmas/test_vck5000.cpp similarity index 100% rename from programming_examples/basic/passthrough_hardware/test_vck5000.cpp rename to programming_examples/basic/passthrough_dmas/test_vck5000.cpp diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt new file mode 100644 index 0000000000..8eba23849f --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt @@ -0,0 +1,80 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Xilinx Inc. 
+ +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif () + +set(PASSTHROUGH_SIZE 4096 CACHE STRING "size") +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} +${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC + PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE} + DISABLE_ABI_CHECK=1 + ) + +target_include_directories (${currentTarget} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + 
boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile new file mode 100644 index 0000000000..fe7daabd4b --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -0,0 +1,49 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../makefile-common + +VPATH := ../../../aie_kernels/aie_generic + +PASSTHROUGH_SIZE = 4096 + +targetname = passThroughKernel + +.PHONY: all template clean + +all: build/final_${PASSTHROUGH_SIZE}.xclbin + +build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py + mkdir -p ${@D} + python3 $< ${PASSTHROUGH_SIZE} > $@ + +build/passThrough.cc.o: passThrough.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} + +build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE} + cd _build && ${powershell} cmake --build . 
--config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt + ${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE + +clean: + rm -rf build _build ${targetname}.exe diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py new file mode 100644 index 0000000000..b401f5801f --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -0,0 +1,170 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 AMD Inc. + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx + +N = 1024 + +if len(sys.argv) == 2: + N = int(sys.argv[1]) + +lineWidthInBytes = N // 4 # chop input in 4 sub-tensors +lineWidthInInt32s = lineWidthInBytes // 4 + +enableTrace = False +traceSizeInBytes = 8192 +traceSizeInInt32s = traceSizeInBytes // 4 + + +def passthroughKernel(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + # define types + memRef_ty = T.memref(lineWidthInBytes, T.ui8()) + + # AIE Core Function declarations + passThroughLine = external_func( + "passThroughLine", inputs=[memRef_ty, memRef_ty, T.i32()] + ) + + # Tile declarations + ShimTile = tile(0, 0) + ComputeTile2 = tile(0, 2) + + if enableTrace: + flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "passThrough.cc.o") 
+ def core_body(): + for _ in for_(sys.maxsize): + elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) + elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) + call(passThroughLine, [elemIn, elemOut, lineWidthInBytes]) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # print(ctx.module.operation.verify()) + + tensorSize = N + tensorSizeInInt32s = tensorSize // 4 + tensor_ty = T.memref(lineWidthInInt32s, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + def sequence(inTensor, outTensor, notUsed): + if enableTrace: + # Trace output + + # Trace_Event0, Trace_Event1: Select which events to trace. + # Note that the event buffers only appear to be transferred to DDR in + # bursts of 256 bytes. If less than 256 bytes are written, you may not + # see trace output, or only see it on the next iteration of your + # kernel invocation, as the buffer gets filled up. Note that, even + # though events are encoded as 4 byte words, it may take more than 64 + # events to fill the buffer to 256 bytes and cause a flush, since + # multiple repeating events can be 'compressed' by the trace mechanism. + # In order to always generate sufficient events, we add the "assert + # TRUE" event to one slot, which fires every cycle, and thus fills our + # buffer quickly. 
+ + # Some events: + # TRUE (0x01) + # STREAM_STALL (0x18) + # LOCK_STALL (0x1A) + # EVENTS_CORE_INSTR_EVENT_1 (0x22) + # EVENTS_CORE_INSTR_EVENT_0 (0x21) + # INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction + # INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction + # INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction + # EVENTS_CORE_PORT_RUNNING_1 (0x4F) + # EVENTS_CORE_PORT_RUNNING_0 (0x4B) + + # Trace_Event0 (4 slots) + IpuWrite32(0, 2, 0x340E0, 0x4B222125) + # Trace_Event1 (4 slots) + IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F) + + # Event slots as configured above: + # 0: Kernel executes vector instruction + # 1: Event 0 -- Kernel starts + # 2: Event 1 -- Kernel done + # 3: Port_Running_0 + # 4: Port_Running_1 + # 5: Lock Stall + # 6: Lock Acquire Instr + # 7: Lock Release Instr + + # Stream_Switch_Event_Port_Selection_0 + # This is necessary to capture the Port_Running_0 and Port_Running_1 events + IpuWrite32(0, 2, 0x3FF00, 0x121) + + # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. + IpuWrite32(0, 2, 0x340D0, 0x10000) + + # Start trace copy out. 
+ IpuWriteBdShimTile( + bd_id=3, + buffer_length=traceSizeInBytes, + buffer_offset=tensorSize, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=0, + column_num=1, + d0_stride=0, + d0_wrap=0, + d1_stride=0, + d1_wrap=0, + d2_stride=0, + ddr_id=2, + iteration_current=0, + iteration_stride=0, + iteration_wrap=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + IpuWrite32(0, 0, 0x1D20C, 0x3) + + ipu_dma_memcpy_nd( + metadata="in", + bd_id=0, + mem=inTensor, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_dma_memcpy_nd( + metadata="out", + bd_id=1, + mem=outTensor, + sizes=[1, 1, 1, tensorSizeInInt32s], + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +passthroughKernel() diff --git a/programming_examples/basic/passthrough_kernel/run.lit b/programming_examples/basic/passthrough_kernel/run.lit new file mode 100644 index 0000000000..ba521da236 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/run.lit @@ -0,0 +1,12 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o +// RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! 
+ diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp new file mode 100644 index 0000000000..7b8779ca13 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/test.cpp @@ -0,0 +1,118 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_utils.h" +#include "xrt/xrt_bo.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +// ------------------------------------------------------ +// Configure this to match your buffer data type +// ------------------------------------------------------ +using DATATYPE = std::uint8_t; +#endif + +namespace po = boost::program_options; + +int main(int argc, const char *argv[]) { + + // Program arguments parsing + po::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6"); + po::variables_map vm; + + test_utils::parse_options(argc, argv, desc, vm); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // Start the XRT 
context and load the kernel + xrt::device device; + xrt::kernel kernel; + + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); + + // set up the buffer objects + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_out = xrt::bo(device, PASSTHROUGH_SIZE * sizeof(DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Copy instruction stream to xrt buffer object + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize buffer bo_inA + DATATYPE *bufInA = bo_inA.map(); + for (int i = 0; i < PASSTHROUGH_SIZE; i++) + bufInA[i] = i; + + // Zero out buffer bo_out + DATATYPE *bufOut = bo_out.map(); + memset(bufOut, 0, PASSTHROUGH_SIZE * sizeof(DATATYPE)); + + // sync host to device memories + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Execute the kernel and wait to finish + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_out); + run.wait(); + + // Sync device to host memories + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // Compare out to in + int errors = 0; + for (int i = 0; i < PASSTHROUGH_SIZE; i++) { + if (bufOut[i] != bufInA[i]) + errors++; + } + + // Print Pass/Fail result of our test + if (!errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." 
<< std::endl << std::endl; + return 1; + } +} diff --git a/programming_examples/basic/relu/CMakeLists.txt b/programming_examples/basic/relu/CMakeLists.txt index d9f511062f..c4ca0825d4 100644 --- a/programming_examples/basic/relu/CMakeLists.txt +++ b/programming_examples/basic/relu/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/relu/Makefile b/programming_examples/basic/relu/Makefile index f862aad403..87e836fbfb 100644 --- a/programming_examples/basic/relu/Makefile +++ b/programming_examples/basic/relu/Makefile @@ -6,15 +6,15 @@ # ##===----------------------------------------------------------------------===## -include ../makefile-common - -targetname = testRelu +include ../../makefile-common all: build/final.xclbin build/insts.txt -build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc +targetname = testRelu + +build/bf16_relu.o: ../../../aie_kernels/relu.cc mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. 
-I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c ../$< -o ${@F} build/aie.mlir: aie2.py mkdir -p ${@D} @@ -28,7 +28,7 @@ build/final.xclbin: build/aie.mlir build/bf16_relu.o ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . --config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ @@ -42,7 +42,6 @@ run: ${targetname}.exe build/final.xclbin build/insts.txt run_g: ${targetname}.exe build/final.xclbin build/insts.txt ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536 - trace: ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json diff --git a/programming_examples/basic/vector_add_reduce/CMakeLists.txt b/programming_examples/basic/vector_add_reduce/CMakeLists.txt new file mode 100644 index 0000000000..76d48dfe36 --- /dev/null +++ b/programming_examples/basic/vector_add_reduce/CMakeLists.txt @@ -0,0 +1,69 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. 
+ +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() \ No newline at end of file diff --git a/programming_examples/basic/vector_add_reduce/Makefile b/programming_examples/basic/vector_add_reduce/Makefile new file mode 100644 index 0000000000..37ca25abec --- /dev/null +++ 
b/programming_examples/basic/vector_add_reduce/Makefile
@@ -0,0 +1,76 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+ACDC_AIE = $(dir $(shell which aie-opt))/..
+
+targetname = vector_add_reduce
+devicename = ipu
+col = 0
+CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
+KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/aie2/
+
+all: build/final.xclbin build/insts.txt
+
+build/i32_add_reduce.o: ${KERNEL_LIB}/i32_add_reduce.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -c $< -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir build/i32_add_reduce.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+		--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS}
+vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/
+
+vck5000: build/aie.mlir build/i32_add_reduce.o
+	cp build/i32_add_reduce.o* ./
+	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+		-I${ROCM_ROOT}/include \
+		-L/opt/xaiengine/lib \
+		-L/lib/x86_64-linux-gnu/ \
+		test_vck5000.cpp \
+		$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+		${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+		-Wl,-R/opt/xaiengine/lib \
+		-Wl,-rpath,${ROCM_ROOT}/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+clean: clean_trace
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe i32_add_reduce.o*
\ No newline at end of file
diff --git a/programming_examples/basic/vector_add_reduce/aie2.py b/programming_examples/basic/vector_add_reduce/aie2.py
new file mode 100644
index 0000000000..2e0f980235
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/aie2.py
@@ -0,0 +1,90 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+ +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith + +import sys + + +def my_vector_max(): + N = 1024 + + buffer_depth = 2 + + with mlir_mod_ctx() as ctx: + + if len(sys.argv) != 3: + raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") + + if sys.argv[1] == "ipu": + dev = AIEDevice.ipu + elif sys.argv[1] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) + def device_body(): + memRef_I_ty = T.memref(N, T.i32()) + memRef_O_ty = T.memref(1, T.i32()) + + # AIE Core Function declarations + + i32_add_reduce_vector = external_func( + "i32_add_reduce_vector", inputs=[memRef_I_ty, memRef_O_ty] + ) + + i32_add_reduce_scalar = external_func( + "i32_add_reduce_scalar", inputs=[memRef_I_ty, memRef_O_ty] + ) + + # Tile declarations + ShimTile = tile(int(sys.argv[2]), 0) + ComputeTile2 = tile(int(sys.argv[2]), 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty) + of_out = object_fifo( + "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "i32_add_reduce.o") + def core_body(): + for _ in for_(0xFFFFFFFF): + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + + call( + i32_add_reduce_vector, + [elem_in, elem_out], + ) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty) + def sequence(A, C): + ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + 
ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +my_vector_max() diff --git a/programming_examples/basic/vector_add_reduce/run.lit b/programming_examples/basic/vector_add_reduce/run.lit new file mode 100644 index 0000000000..b29f36cc11 --- /dev/null +++ b/programming_examples/basic/vector_add_reduce/run.lit @@ -0,0 +1,13 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir +// RUN: mkdir build +// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! + diff --git a/programming_examples/basic/vector_add_reduce/run_vck5000.lit b/programming_examples/basic/vector_add_reduce/run_vck5000.lit new file mode 100644 index 0000000000..d314eea2a4 --- /dev/null +++ b/programming_examples/basic/vector_add_reduce/run_vck5000.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: hsa +// +// RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir +// RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib +// RUN: %run_on_vck5000 ./test.elf + diff --git a/programming_examples/basic/vector_add_reduce/test.cpp b/programming_examples/basic/vector_add_reduce/test.cpp new file mode 100644 index 0000000000..bd7438a0f9 --- /dev/null +++ b/programming_examples/basic/vector_add_reduce/test.cpp @@ -0,0 +1,261 @@ +///===- test.cpp -------------------------------------------000---*- C++ +///-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED + +using INOUT0_DATATYPE = std::int32_t; +using INOUT1_DATATYPE = std::int32_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 1024; // Input only, 64x uint32_t in this example + int INOUT1_VOLUME = 1; // Not used in this example + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + + // TODO Remove trace for now? 
+ size_t OUT_SIZE = INOUT1_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + // Load the kernel + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + // Register xclbin + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + device.register_xclbin(xclbin); + + // Get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // Get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto 
bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  // Golden reference for the add-reduce: 1024 values below 100000 fit int32.
+  std::int32_t sum = 0;
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    std::int32_t next = test_utils::random_int32_t(100000);
+    sum += next;
+    bufInOut0[i] = next;
+  }
+  // Inout buffer 1 is output-only and is not initialized on the host. It is
+  // allocated with OUT_SIZE (INOUT1_SIZE + trace_size bytes) so that the
+  // trace region read back below, starting at offset INOUT1_SIZE, stays
+  // inside the buffer object.
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    if (verbosity >= 1) {
+      std::cout << "Running Kernel.\n";
+    }
+
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      if (bufInOut1[0] != sum) {
+        errors++;
+        std::cout << "sum is " << sum << " calc " << bufInOut1[0] << std::endl;
+      }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us."
<< std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/vector_add_reduce/test_vck5000.cpp b/programming_examples/basic/vector_add_reduce/test_vck5000.cpp
new file mode 100644
index 0000000000..25de4c7823
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/test_vck5000.cpp
@@ -0,0 +1,148 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2020-2022, Xilinx Inc.
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "memory_allocator.h"
+#include "test_library.h"
+
+#include "aie_data_movement.cpp"
+#include "aie_inc.cpp"
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+constexpr int DMA_COUNT = 64;
+
+// Report on stdout whether the HSA call named func_name succeeded; on failure
+// the runtime's textual description of the status code is printed as well.
+void hsa_check_status(const std::string &func_name, hsa_status_t status) {
+  if (status != HSA_STATUS_SUCCESS) {
+    // hsa_status_string() stores a runtime-owned string: never allocate/free.
+    const char *status_string = "unknown HSA status";
+    hsa_status_string(status, &status_string);
+    std::cout << func_name << " failed: " << status_string << std::endl;
+  } else {
+    std::cout << func_name << " success" << std::endl;
+  }
+}
+
+int main(int argc, char *argv[]) {
+  uint64_t row = 0;
+  uint64_t col = 6;
+
+  std::vector<hsa_queue_t *> queues;
+  uint32_t aie_max_queue_size(0);
+
+  aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie();
+
+  // This is going to initialize HSA, create a queue
+  // and get an agent
+  int ret = mlir_aie_init_device(xaie);
+
+  if (ret) {
+    std::cout << "[ERROR] Error when calling mlir_aie_init_device)"
+              << std::endl;
+    return -1;
+
} + + // Getting access to all of the HSA agents + std::vector agents = xaie->agents; + + if (agents.empty()) { + std::cout << "No agents found. Exiting." << std::endl; + return -1; + } + + std::cout << "Found " << agents.size() << " agents" << std::endl; + + hsa_queue_t *q = xaie->cmd_queue; + + // Adding to our vector of queues + queues.push_back(q); + assert(queues.size() > 0 && "No queues were sucesfully created!"); + + mlir_aie_configure_cores(xaie); + mlir_aie_configure_switchboxes(xaie); + mlir_aie_initialize_locks(xaie); + mlir_aie_configure_dmas(xaie); + mlir_aie_start_cores(xaie); + + // Allocating some device memory + ext_mem_model_t buf0, buf1, buf2; + uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT); + uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT); + uint32_t *out = (uint32_t *)mlir_aie_mem_alloc( + xaie, buf2, 4 /* For some reason can't do 1 */); + mlir_aie_sync_mem_dev(buf0); + mlir_aie_sync_mem_dev(buf1); + mlir_aie_sync_mem_dev(buf2); + + if (in_a == nullptr || in_b == nullptr || out == nullptr) { + std::cout << "Could not allocate in device memory" << std::endl; + return -1; + } + + out[0] = 0xdeface; + for (int i = 0; i < DMA_COUNT; i++) { + in_a[i] = i + 1; + } + + in_a[DMA_COUNT / 2] = 123456; + in_a[DMA_COUNT - 1] = 100; + + // printf("[EDDIE DEBUG] max_val before data movement is %d\n", + // mlir_aie_read_buffer_max_val(xaie, 0)); + + // Pass arguments in the order of dma_memcpys in the mlir + invoke_data_movement(queues[0], &agents[0], out, in_a); + + int errors = 0; + + uint32_t max_val = 0; + for (int i = 0; i < DMA_COUNT; i++) { + uint32_t s = in_a[i]; + if (max_val < s) { + max_val = s; + } + } + + // printf("[EDDIE DEBUG] max_val before data movement is %d\n", + // mlir_aie_read_buffer_max_val(xaie, 0)); + + if (*out != max_val) { + errors++; + printf("[ERROR] Maximum value is %d but kernel returned %d\n", max_val, + *out); + } + + // destroying the queue + hsa_queue_destroy(queues[0]); + + 
// Shutdown AIR and HSA + mlir_aie_deinit_libxaie(xaie); + + if (!errors) { + printf("PASS!\n"); + return 0; + } else { + printf("fail %d/%d.\n", errors, 1); + return -1; + } +} diff --git a/programming_examples/basic/vector_bias_add/CMakeLists.txt b/programming_examples/basic/vector_bias_add/CMakeLists.txt index d9f511062f..c4ca0825d4 100644 --- a/programming_examples/basic/vector_bias_add/CMakeLists.txt +++ b/programming_examples/basic/vector_bias_add/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_bias_add/Makefile b/programming_examples/basic/vector_bias_add/Makefile index 14e58914be..80ee07528b 100644 --- a/programming_examples/basic/vector_bias_add/Makefile +++ b/programming_examples/basic/vector_bias_add/Makefile @@ -6,7 +6,7 @@ # ##===----------------------------------------------------------------------===## -include ../makefile-common +include ../../makefile-common all: build/final.xclbin diff --git a/programming_examples/basic/vector_exp/CMakeLists.txt 
b/programming_examples/basic/vector_exp/CMakeLists.txt index d9f511062f..c4ca0825d4 100644 --- a/programming_examples/basic/vector_exp/CMakeLists.txt +++ b/programming_examples/basic/vector_exp/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_exp/Makefile b/programming_examples/basic/vector_exp/Makefile index 9effbdadfb..02860c4f31 100755 --- a/programming_examples/basic/vector_exp/Makefile +++ b/programming_examples/basic/vector_exp/Makefile @@ -6,7 +6,7 @@ # ##===----------------------------------------------------------------------===## -include ../makefile-common +include ../../makefile-common targetname = testExp @@ -17,15 +17,15 @@ build/vecexp.cc: bf16_exp_lut.mlir cd ${@D} && aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o vecexp.cc build/vecexp.o: build/vecexp.cc - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c 
$(<:%=../%) -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} build/lut_based_ops.o: mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F} build/exp.o: exp.cc mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} build/kernels.a: build/exp.o build/lut_based_ops.o build/vecexp.o ar rvs $@ $+ @@ -42,7 +42,7 @@ build/final.xclbin: build/aie.mlir build/kernels.a ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/basic/vector_max/CMakeLists.txt b/programming_examples/basic/vector_max/CMakeLists.txt index d9f511062f..c64f84842b 100644 --- a/programming_examples/basic/vector_max/CMakeLists.txt +++ b/programming_examples/basic/vector_max/CMakeLists.txt @@ -48,6 +48,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_max/Makefile b/programming_examples/basic/vector_max/Makefile index 4f5f1d5522..4450cdd20f 100755 --- a/programming_examples/basic/vector_max/Makefile +++ b/programming_examples/basic/vector_max/Makefile @@ -10,18 +10,26 @@ include ../makefile-common ACDC_AIE = $(dir $(shell which aie-opt))/.. -SHELL := /bin/bash - -targetname = vectorMax +targetname = vector_max devicename = ipu col = 0 +CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} +KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/ -all: build/final.xclbin +all: build/final.xclbin build/insts.txt -build/final.xclbin: build/aie.mlir +build/vector_max.o: ${KERNEL_LIB}/vector_max.cc mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ $@ + +build/final.xclbin: build/aie.mlir build/vector_max.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build @@ -34,30 +42,35 @@ else cp _build/${targetname} $@ endif -build/aie.mlir: aie2.py - mkdir -p ${@D} - python3 $< ${devicename} ${col} > $@ +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + +trace: + 
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json + +clean_trace: + rm -rf tmpTrace trace.txt # Changing variables when we target VCK5000 vck5000: devicename=xcvc1902 vck5000: col=6 +vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS} +vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/ -vck5000: build/aie.mlir +vck5000: build/aie.mlir build/scale.o + cp build/scale.o* ./ aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ - -I/opt/xaiengine/include \ - -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ - -I${ROCM_ROOT}/include \ - -L/opt/xaiengine/lib \ - -L/lib/x86_64-linux-gnu/ \ - test_vck5000.cpp \ - $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ - ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ - -Wl,-R/opt/xaiengine/lib \ - -Wl,-rpath,${ROCM_ROOT}/lib \ - -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf - -run: ${targetname}.exe build/final.xclbin build/insts.txt - ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + -I/opt/xaiengine/include \ + -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ + -I${ROCM_ROOT}/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + test_vck5000.cpp \ + $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,-rpath,${ROCM_ROOT}/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf -clean: - rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe +clean: clean_trace + rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_max.o* vector_max.cc diff --git a/programming_examples/basic/vector_max/aie2.py b/programming_examples/basic/vector_max/aie2.py index 5d094a621d..fdc5deba52 100755 --- a/programming_examples/basic/vector_max/aie2.py +++ b/programming_examples/basic/vector_max/aie2.py @@ -17,7 +17,7 @@ def 
my_vector_max(): - N = 64 + N = 1024 buffer_depth = 2 @@ -35,39 +35,38 @@ def my_vector_max(): @device(dev) def device_body(): - memRef_ty = T.memref(N, T.i32()) + memRef_I_ty = T.memref(N, T.i32()) + memRef_O_ty = T.memref(1, T.i32()) # AIE Core Function declarations + vector_max = external_func("vector_max", inputs=[memRef_I_ty, memRef_O_ty]) + + scalar_max = external_func("scalar_max", inputs=[memRef_I_ty, memRef_O_ty]) + # Tile declarations ShimTile = tile(int(sys.argv[2]), 0) ComputeTile2 = tile(int(sys.argv[2]), 2) # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_ty) - of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty) + of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty) + of_out = object_fifo( + "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty + ) # Set up compute tiles # Compute tile 2 - @core(ComputeTile2) + @core(ComputeTile2, "vector_max.o") def core_body(): - max_val = memref.alloc(1, T.i32()) - memref.store(arith.constant(0, T.i32()), max_val, [0]) - # Effective while(1) - for _ in for_(sys.maxsize): - # Number of sub-vector "tile" iterations - elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + for _ in for_(0xFFFFFFFF): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - for i in for_(N): - v0 = memref.load(elem_in, [i]) - v1 = memref.load(max_val, [0]) - v2 = arith.maxui(v1, v0) - memref.store(v2, max_val, [0]) - yield_([]) - - v3 = memref.load(max_val, [0]) - memref.store(v3, elem_out, [0]) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + + call( + vector_max, + [elem_in, elem_out], + ) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) yield_([]) @@ -75,8 +74,8 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) - def sequence(A, B, C): + @FuncOp.from_py_func(tensor_ty, tensor_ty) + def 
sequence(A, C): ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) ipu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/vector_max/run.lit b/programming_examples/basic/vector_max/run.lit index a429e99221..b29f36cc11 100644 --- a/programming_examples/basic/vector_max/run.lit +++ b/programming_examples/basic/vector_max/run.lit @@ -4,6 +4,8 @@ // REQUIRES: ryzen_ai // // RUN: %python %S/aie2.py ipu 0 > ./aie.mlir +// RUN: mkdir build +// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o // RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem // RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s diff --git a/programming_examples/basic/vector_max/test.cpp b/programming_examples/basic/vector_max/test.cpp index ffe277e862..bd7438a0f9 100644 --- a/programming_examples/basic/vector_max/test.cpp +++ b/programming_examples/basic/vector_max/test.cpp @@ -1,4 +1,5 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// +///===- test.cpp -------------------------------------------000---*- C++ +///-*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,9 +9,9 @@ // //===----------------------------------------------------------------------===// +#include #include #include -#include #include #include #include @@ -21,79 +22,59 @@ #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -constexpr int IN_SIZE = 64; -constexpr int OUT_SIZE = 1; +#include "test_utils.h" -namespace po = boost::program_options; +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} +using INOUT0_DATATYPE = std::int32_t; +using INOUT1_DATATYPE = std::int32_t; +#endif -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} +namespace po = boost::program_options; +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- int main(int argc, const char *argv[]) { - // Program arguments parsing + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ po::options_description desc("Allowed options"); - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", 
po::value()->default_value(0), - "the verbosity of the output")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); po::variables_map vm; + test_utils::add_default_options(desc); - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 1024; // Input: 1024 x int32_t in this example + int INOUT1_VOLUME = 1; // Output: one int32_t (the computed maximum) - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); + // TODO Remove trace for now? 
+ size_t OUT_SIZE = INOUT1_SIZE + trace_size; - int verbosity = vm["verbosity"].as(); + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); if (verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - // Start the XRT test code + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ // Get a device handle unsigned int device_index = 0; auto device = xrt::device(device_index); @@ -103,6 +84,7 @@ int main(int argc, const char *argv[]) { std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; auto xclbin = xrt::xclbin(vm["xclbin"].as()); + // Load the kernel if (verbosity >= 1) std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; std::string Node = vm["kernel"].as(); @@ -110,85 +92,170 @@ int main(int argc, const char *argv[]) { // Get the kernel from the xclbin auto xkernels = xclbin.get_kernels(); auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { + [Node, verbosity](xrt::xclbin::kernel &k) { auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } return name.rfind(Node, 0) == 0; }); auto kernelName = xkernel.get_name(); + // Register xclbin if (verbosity >= 1) std::cout << "Registering xclbin: " << vm["xclbin"].as() << "\n"; - device.register_xclbin(xclbin); - // get a hardware context + // Get a hardware context if (verbosity >= 1) std::cout << "Getting hardware context.\n"; xrt::hw_context context(device, xclbin.get_uuid()); - // get a kernel handle + // Get a kernel handle if (verbosity >= 1) std::cout << "Getting handle to kernel:" << kernelName << "\n"; auto kernel = xrt::kernel(context, kernelName); + // ------------------------------------------------------ + // Initialize input/ output 
buffer sizes and sync them + // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; - int32_t *bufInA = bo_inA.map(); - std::vector srcVecA; - for (int i = 0; i < IN_SIZE; i++) - srcVecA.push_back(i + 1); - memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); - - bufInA[IN_SIZE / 2] = 654321; - bufInA[IN_SIZE - 1] = 100; - + // Initialize instruction buffer void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + // Initialize Inout buffer 0 + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::int32_t max = (std::int32_t)-2147483648; + for (int i = 0; i < INOUT0_VOLUME; i++) { + std::int32_t next = test_utils::random_int32_t(100000); + if (next > max) + max = next; + bufInOut0[i] = next; + } + // Initialize Inout buffer 1 + // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME + + // trace_size + + // Sync buffers to update input buffer values bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + // 
------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + int errors = 0; - uint32_t *bufOut = bo_out.map(); + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { - int errors = 0; + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } - uint32_t max_val = 0; - for (uint32_t i = 0; i < IN_SIZE; i++) { - if (*(bufInA + i) > max_val) { - max_val = *(bufInA + i); + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + + if (iter < n_warmup_iterations) { + /* Warmup iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + if (bufInOut1[0] != max) { + errors++; + std::cout << "max is " << max << " calc " << bufInOut1[0] << std::endl; + } + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." 
<< std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size, + vm["trace_file"].as()); } - } - if (*bufOut != max_val) { - std::cout << "[ERROR] Maximum value is " << max_val - << " but kernel returned " << *bufOut << "\n"; - errors++; + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; } + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." 
<< std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + if (!errors) { std::cout << "\nPASS!\n\n"; return 0; } else { - std::cout << "\nfailed.\n\n"; + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; return 1; } } diff --git a/programming_examples/basic/passthrough_hardware/CMakeLists.txt b/programming_examples/basic/vector_max_reduce/CMakeLists.txt similarity index 96% rename from programming_examples/basic/passthrough_hardware/CMakeLists.txt rename to programming_examples/basic/vector_max_reduce/CMakeLists.txt index 58cacd33af..c64f84842b 100644 --- a/programming_examples/basic/passthrough_hardware/CMakeLists.txt +++ b/programming_examples/basic/vector_max_reduce/CMakeLists.txt @@ -27,7 +27,7 @@ endif() set(TARGET_NAME test CACHE STRING "Target to be built") -SET (ProjectName proj_${TARGET_NAME}) +SET (ProjectName ${TARGET_NAME}) SET (currentTarget ${TARGET_NAME}) if ( WSL ) @@ -48,6 +48,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_max_reduce/Makefile b/programming_examples/basic/vector_max_reduce/Makefile new file mode 100755 index 0000000000..981856f203 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/Makefile @@ -0,0 +1,76 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../makefile-common + +ACDC_AIE = $(dir $(shell which aie-opt))/.. 
+ +targetname = vector_max +devicename = ipu +col = 0 +CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} +KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/ + +all: build/final.xclbin build/insts.txt + +build/vector_max.o: ${KERNEL_LIB}/vector_max.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} + +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< ${devicename} ${col} > $@ + +build/final.xclbin: build/aie.mlir build/vector_max.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . --config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + +trace: + ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json + +clean_trace: + rm -rf tmpTrace trace.txt + +# Changing variables when we target VCK5000 +vck5000: devicename=xcvc1902 +vck5000: col=6 +vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS} +vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/ + +vck5000: build/aie.mlir build/scale.o + cp build/scale.o* ./ + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ + -I/opt/xaiengine/include \ + -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ + -I${ROCM_ROOT}/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + test_vck5000.cpp \ + $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,-rpath,${ROCM_ROOT}/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf 
-o test.elf + +clean: clean_trace + rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_max.o* vector_max.cc \ No newline at end of file diff --git a/programming_examples/basic/vector_max_reduce/README.md b/programming_examples/basic/vector_max_reduce/README.md new file mode 100644 index 0000000000..50dc039a51 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/README.md @@ -0,0 +1,29 @@ + + +# Vector max + +This reference design can be run on either a RyzenAI IPU or a VCK5000. + +A single tile traverses through a vector in memory and returns the maximum value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary maximum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` is dependent on whether the application is targeting IPU or VCK5000. + + +To compile and run the design for IPU: +``` +make +make run +``` + +To compile and run the design for VCK5000: +``` +make vck5000 +./test.elf +``` + diff --git a/programming_examples/basic/vector_max_reduce/aie2.py b/programming_examples/basic/vector_max_reduce/aie2.py new file mode 100755 index 0000000000..fdc5deba52 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/aie2.py @@ -0,0 +1,86 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 AMD Inc. 
+ +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith + +import sys + + +def my_vector_max(): + N = 1024 + + buffer_depth = 2 + + with mlir_mod_ctx() as ctx: + + if len(sys.argv) != 3: + raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") + + if sys.argv[1] == "ipu": + dev = AIEDevice.ipu + elif sys.argv[1] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) + def device_body(): + memRef_I_ty = T.memref(N, T.i32()) + memRef_O_ty = T.memref(1, T.i32()) + + # AIE Core Function declarations + + vector_max = external_func("vector_max", inputs=[memRef_I_ty, memRef_O_ty]) + + scalar_max = external_func("scalar_max", inputs=[memRef_I_ty, memRef_O_ty]) + + # Tile declarations + ShimTile = tile(int(sys.argv[2]), 0) + ComputeTile2 = tile(int(sys.argv[2]), 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty) + of_out = object_fifo( + "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "vector_max.o") + def core_body(): + for _ in for_(0xFFFFFFFF): + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + + call( + vector_max, + [elem_in, elem_out], + ) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty) + def sequence(A, C): + ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + 
+my_vector_max() diff --git a/programming_examples/basic/vector_max_reduce/run.lit b/programming_examples/basic/vector_max_reduce/run.lit new file mode 100644 index 0000000000..b29f36cc11 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/run.lit @@ -0,0 +1,13 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir +// RUN: mkdir build +// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! + diff --git a/programming_examples/basic/vector_max_reduce/run_vck5000.lit b/programming_examples/basic/vector_max_reduce/run_vck5000.lit new file mode 100644 index 0000000000..d314eea2a4 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/run_vck5000.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: hsa +// +// RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir +// RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib +// RUN: %run_on_vck5000 ./test.elf + diff --git a/programming_examples/basic/vector_max_reduce/test.cpp b/programming_examples/basic/vector_max_reduce/test.cpp new file mode 100644 index 0000000000..bd7438a0f9 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/test.cpp @@ -0,0 +1,261 @@ +///===- test.cpp -------------------------------------------000---*- C++ +///-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED + +using INOUT0_DATATYPE = std::int32_t; +using INOUT1_DATATYPE = std::int32_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 1024; // Input: 1024 x int32_t in this example + int INOUT1_VOLUME = 1; // Output: one int32_t (the computed maximum) + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + + // TODO Remove trace for now? 
+ size_t OUT_SIZE = INOUT1_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + // Load the kernel + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + // Register xclbin + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + device.register_xclbin(xclbin); + + // Get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // Get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto 
bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Initialize instruction buffer + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize Inout buffer 0 + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::int32_t max = (std::int32_t)-2147483648; + for (int i = 0; i < INOUT0_VOLUME; i++) { + std::int32_t next = test_utils::random_int32_t(100000); + if (next > max) + max = next; + bufInOut0[i] = next; + } + // Initialize Inout buffer 1 + // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME + + // trace_size + + // Sync buffers to update input buffer values + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + + if (iter < n_warmup_iterations) { + /* Warmup 
iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + if (bufInOut1[0] != max) { + errors++; + std::cout << "max is " << max << " calc " << bufInOut1[0] << std::endl; + } + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size, + vm["trace_file"].as()); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." 
<< std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/basic/vector_max_reduce/test_vck5000.cpp b/programming_examples/basic/vector_max_reduce/test_vck5000.cpp new file mode 100644 index 0000000000..25de4c7823 --- /dev/null +++ b/programming_examples/basic/vector_max_reduce/test_vck5000.cpp @@ -0,0 +1,146 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// Copyright (C) 2020-2022, Xilinx Inc. +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory_allocator.h" +#include "test_library.h" + +#include "aie_data_movement.cpp" +#include "aie_inc.cpp" + +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +constexpr int DMA_COUNT = 64; + +void hsa_check_status(const std::string func_name, hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char *status_string(new char[1024]); + hsa_status_string(status, &status_string); + std::cout << func_name << " failed: " << status_string << std::endl; + delete[] status_string; + } else { + std::cout << func_name << " success" << std::endl; + } +} + +int main(int argc, char *argv[]) { + uint64_t row = 0; + uint64_t col = 6; + + std::vector queues; + uint32_t aie_max_queue_size(0); + + aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie(); + + // This is going to initialize HSA, create a queue + // and get an agent + int ret = mlir_aie_init_device(xaie); + + if (ret) { + std::cout << "[ERROR] Error when calling mlir_aie_init_device)" + << std::endl; + return -1; + 
} + + // Getting access to all of the HSA agents + std::vector agents = xaie->agents; + + if (agents.empty()) { + std::cout << "No agents found. Exiting." << std::endl; + return -1; + } + + std::cout << "Found " << agents.size() << " agents" << std::endl; + + hsa_queue_t *q = xaie->cmd_queue; + + // Adding to our vector of queues + queues.push_back(q); + assert(queues.size() > 0 && "No queues were sucesfully created!"); + + mlir_aie_configure_cores(xaie); + mlir_aie_configure_switchboxes(xaie); + mlir_aie_initialize_locks(xaie); + mlir_aie_configure_dmas(xaie); + mlir_aie_start_cores(xaie); + + // Allocating some device memory + ext_mem_model_t buf0, buf1, buf2; + uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT); + uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT); + uint32_t *out = (uint32_t *)mlir_aie_mem_alloc( + xaie, buf2, 4 /* For some reason can't do 1 */); + mlir_aie_sync_mem_dev(buf0); + mlir_aie_sync_mem_dev(buf1); + mlir_aie_sync_mem_dev(buf2); + + if (in_a == nullptr || in_b == nullptr || out == nullptr) { + std::cout << "Could not allocate in device memory" << std::endl; + return -1; + } + + out[0] = 0xdeface; + for (int i = 0; i < DMA_COUNT; i++) { + in_a[i] = i + 1; + } + + in_a[DMA_COUNT / 2] = 123456; + in_a[DMA_COUNT - 1] = 100; + + // printf("[EDDIE DEBUG] max_val before data movement is %d\n", + // mlir_aie_read_buffer_max_val(xaie, 0)); + + // Pass arguments in the order of dma_memcpys in the mlir + invoke_data_movement(queues[0], &agents[0], out, in_a); + + int errors = 0; + + uint32_t max_val = 0; + for (int i = 0; i < DMA_COUNT; i++) { + uint32_t s = in_a[i]; + if (max_val < s) { + max_val = s; + } + } + + // printf("[EDDIE DEBUG] max_val before data movement is %d\n", + // mlir_aie_read_buffer_max_val(xaie, 0)); + + if (*out != max_val) { + errors++; + printf("[ERROR] Maximum value is %d but kernel returned %d\n", max_val, + *out); + } + + // destroying the queue + hsa_queue_destroy(queues[0]); + + 
// Shutdown AIR and HSA + mlir_aie_deinit_libxaie(xaie); + + if (!errors) { + printf("PASS!\n"); + return 0; + } else { + printf("fail %d/%d.\n", errors, 1); + return -1; + } +} diff --git a/programming_examples/basic/vector_min/CMakeLists.txt b/programming_examples/basic/vector_min/CMakeLists.txt index d9f511062f..c64f84842b 100644 --- a/programming_examples/basic/vector_min/CMakeLists.txt +++ b/programming_examples/basic/vector_min/CMakeLists.txt @@ -48,6 +48,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_min/Makefile b/programming_examples/basic/vector_min/Makefile index 5dec01e083..a5d96b0149 100755 --- a/programming_examples/basic/vector_min/Makefile +++ b/programming_examples/basic/vector_min/Makefile @@ -10,18 +10,26 @@ include ../makefile-common ACDC_AIE = $(dir $(shell which aie-opt))/.. 
-SHELL := /bin/bash - -targetname = vectorMin +targetname = vector_min devicename = ipu col = 0 +CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} +KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/ -all: build/final.xclbin +all: build/final.xclbin build/insts.txt -build/final.xclbin: build/aie.mlir +build/vector_min.o: ${KERNEL_LIB}/vector_min.cc mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ $@ + +build/final.xclbin: build/aie.mlir build/vector_min.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build @@ -34,30 +42,35 @@ else cp _build/${targetname} $@ endif -build/aie.mlir: aie2.py - mkdir -p ${@D} - python3 $< ${devicename} ${col} > $@ +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + +trace: + ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json + +clean_trace: + rm -rf tmpTrace trace.txt # Changing variables when we target VCK5000 vck5000: devicename=xcvc1902 vck5000: col=6 +vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS} +vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/ -vck5000: build/aie.mlir +vck5000: build/aie.mlir build/vector_min.o + cp build/vector_min.o* ./ aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ - -I/opt/xaiengine/include \ - -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ - -I${ROCM_ROOT}/include \ - -L/opt/xaiengine/lib \ - -L/lib/x86_64-linux-gnu/ \ - test_vck5000.cpp \ - $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ - ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ - -Wl,-R/opt/xaiengine/lib \ - -Wl,-rpath,${ROCM_ROOT}/lib \ - -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf - -run: 
${targetname}.exe build/final.xclbin build/insts.txt - ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + -I/opt/xaiengine/include \ + -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ + -I${ROCM_ROOT}/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + test_vck5000.cpp \ + $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,-rpath,${ROCM_ROOT}/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf -clean: - rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe +clean: clean_trace + rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_min.o* vector_min.cc diff --git a/programming_examples/basic/vector_min/Makefile.pjr b/programming_examples/basic/vector_min/Makefile.pjr new file mode 100755 index 0000000000..9b0ffca19e --- /dev/null +++ b/programming_examples/basic/vector_min/Makefile.pjr @@ -0,0 +1,67 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../makefile-common + +ACDC_AIE = $(dir $(shell which aie-opt))/.. + +SHELL := /bin/bash + +targetname = vectorMin +devicename = ipu +col = 0 + +all: build/final.xclbin + +build/vector_min.o: + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. 
-c ${REPO_ROOT}/aie_kernels/generic/vector_min.cc -o ${@F} + +build/final.xclbin: build/aie.mlir build/vector_min.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ $@ + +# Changing variables when we target VCK5000 +vck5000: devicename=xcvc1902 +vck5000: col=6 + +vck5000: build/aie.mlir + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ + -I/opt/xaiengine/include \ + -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ + -I${ROCM_ROOT}/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + test_vck5000.cpp \ + $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,-rpath,${ROCM_ROOT}/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + +clean: + rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe diff --git a/programming_examples/basic/vector_min/README.md b/programming_examples/basic/vector_min/README.md index 3581b595ee..50dc039a51 100644 --- a/programming_examples/basic/vector_min/README.md +++ b/programming_examples/basic/vector_min/README.md @@ -8,11 +8,11 @@ // //===----------------------------------------------------------------------===//--> -# Vector min +# Vector min This reference design can be run on either a RyzenAI IPU or a VCK5000. -Single tile traverses through a vector in memory and returns the min value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). 
A buffer in tile (`col`, 2) is used to store the temporary min value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` is dependent on whether the application is targetting IPU or VCK5000. +Single tile traverses through a vector in memory and returns the minimum value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary minimum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` is dependent on whether the application is targeting IPU or VCK5000. To compile and run the design for IPU: diff --git a/programming_examples/basic/vector_min/aie2.py b/programming_examples/basic/vector_min/aie2.py index f0b931431a..c235b7f278 100755 --- a/programming_examples/basic/vector_min/aie2.py +++ b/programming_examples/basic/vector_min/aie2.py @@ -17,7 +17,7 @@ def my_vector_max(): - N = 64 + N = 1024 buffer_depth = 2 @@ -35,39 +35,38 @@ def my_vector_max(): @device(dev) def device_body(): - memRef_ty = T.memref(N, T.i32()) + memRef_I_ty = T.memref(N, T.i32()) + memRef_O_ty = T.memref(1, T.i32()) # AIE Core Function declarations + vector_min = external_func("vector_min", inputs=[memRef_I_ty, memRef_O_ty]) + + scalar_min = external_func("scalar_min", inputs=[memRef_I_ty, memRef_O_ty]) + # Tile declarations ShimTile = tile(int(sys.argv[2]), 0) ComputeTile2 = tile(int(sys.argv[2]), 2) # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_ty) - of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty) + of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty) + of_out = object_fifo( + "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty + ) # Set up compute tiles 
# Compute tile 2 - @core(ComputeTile2) + @core(ComputeTile2, "vector_min.o") def core_body(): - min_val = memref.alloc(1, T.i32()) - memref.store(arith.constant(1000, T.i32()), min_val, [0]) - # Effective while(1) - for _ in for_(sys.maxsize): - # Number of sub-vector "tile" iterations - elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + for _ in for_(0xFFFFFFFF): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - for i in for_(N): - v0 = memref.load(elem_in, [i]) - v1 = memref.load(min_val, [0]) - v2 = arith.minui(v1, v0) - memref.store(v2, min_val, [0]) - yield_([]) - - v3 = memref.load(min_val, [0]) - memref.store(v3, elem_out, [0]) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + + call( + vector_min, + [elem_in, elem_out], + ) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) yield_([]) @@ -75,8 +74,8 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) - def sequence(A, B, C): + @FuncOp.from_py_func(tensor_ty, tensor_ty) + def sequence(A, C): ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) ipu_sync(column=0, row=0, direction=0, channel=0) diff --git a/programming_examples/basic/vector_min/run.lit b/programming_examples/basic/vector_min/run.lit index a429e99221..b29f36cc11 100644 --- a/programming_examples/basic/vector_min/run.lit +++ b/programming_examples/basic/vector_min/run.lit @@ -4,6 +4,8 @@ // REQUIRES: ryzen_ai // // RUN: %python %S/aie2.py ipu 0 > ./aie.mlir +// RUN: mkdir build +// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o // RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem // RUN: 
%run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s diff --git a/programming_examples/basic/vector_min/run_vck5000.lit b/programming_examples/basic/vector_min/run_vck5000.lit index 83b350c50d..d314eea2a4 100644 --- a/programming_examples/basic/vector_min/run_vck5000.lit +++ b/programming_examples/basic/vector_min/run_vck5000.lit @@ -5,5 +5,5 @@ // // RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir // RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib -// RUN: %run_on_vck5000 ./test.elf +// RUN: %run_on_vck5000 ./test.elf diff --git a/programming_examples/basic/vector_min/test.cpp b/programming_examples/basic/vector_min/test.cpp index 61339fc01e..0c596a7068 100644 --- a/programming_examples/basic/vector_min/test.cpp +++ b/programming_examples/basic/vector_min/test.cpp @@ -1,4 +1,5 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// +///===- test.cpp -------------------------------------------000---*- C++ +///-*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,9 +9,9 @@ // //===----------------------------------------------------------------------===// +#include #include #include -#include #include #include #include @@ -21,79 +22,59 @@ #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -constexpr int IN_SIZE = 64; -constexpr int OUT_SIZE = 1; +#include "test_utils.h" -namespace po = boost::program_options; +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} +using INOUT0_DATATYPE = std::int32_t; +using INOUT1_DATATYPE = std::int32_t; +#endif -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} +namespace po = boost::program_options; +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- int main(int argc, const char *argv[]) { - // Program arguments parsing + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ po::options_description desc("Allowed options"); - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", 
po::value()->default_value(0), - "the verbosity of the output")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); po::variables_map vm; + test_utils::add_default_options(desc); - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 1024; // Input only, 64x uint32_t in this example + int INOUT1_VOLUME = 1; // Not used in this example - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); + // TODO Remove trace for now? 
+ size_t OUT_SIZE = INOUT1_SIZE + trace_size; - int verbosity = vm["verbosity"].as(); + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); if (verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - // Start the XRT test code + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ // Get a device handle unsigned int device_index = 0; auto device = xrt::device(device_index); @@ -103,6 +84,7 @@ int main(int argc, const char *argv[]) { std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; auto xclbin = xrt::xclbin(vm["xclbin"].as()); + // Load the kernel if (verbosity >= 1) std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; std::string Node = vm["kernel"].as(); @@ -110,85 +92,170 @@ int main(int argc, const char *argv[]) { // Get the kernel from the xclbin auto xkernels = xclbin.get_kernels(); auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { + [Node, verbosity](xrt::xclbin::kernel &k) { auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } return name.rfind(Node, 0) == 0; }); auto kernelName = xkernel.get_name(); + // Register xclbin if (verbosity >= 1) std::cout << "Registering xclbin: " << vm["xclbin"].as() << "\n"; - device.register_xclbin(xclbin); - // get a hardware context + // Get a hardware context if (verbosity >= 1) std::cout << "Getting hardware context.\n"; xrt::hw_context context(device, xclbin.get_uuid()); - // get a kernel handle + // Get a kernel handle if (verbosity >= 1) std::cout << "Getting handle to kernel:" << kernelName << "\n"; auto kernel = xrt::kernel(context, kernelName); + // ------------------------------------------------------ + // Initialize input/ output 
buffer sizes and sync them + // ------------------------------------------------------ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; - int32_t *bufInA = bo_inA.map(); - std::vector srcVecA; - for (int i = 100; i < 100 + IN_SIZE; i++) - srcVecA.push_back(i + 1); - memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); - - bufInA[IN_SIZE / 2] = 4; - bufInA[IN_SIZE - 1] = 100; - + // Initialize instruction buffer void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + // Initialize Inout buffer 0 + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::int32_t min = (std::int32_t)2147483647; + for (int i = 0; i < INOUT0_VOLUME; i++) { + std::int32_t next = test_utils::random_int32_t(100000); + if (next < min) + min = next; + bufInOut0[i] = next; + } + // Initialize Inout buffer 1 + // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME + + // trace_size + + // Sync buffers to update input buffer values bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + // 
------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + int errors = 0; - uint32_t *bufOut = bo_out.map(); + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { - int errors = 0; + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } - uint32_t min_val = 1000000; - for (uint32_t i = 0; i < IN_SIZE; i++) { - if (*(bufInA + i) < min_val) { - min_val = *(bufInA + i); + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + + if (iter < n_warmup_iterations) { + /* Warmup iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + if (bufInOut1[0] != min) { + errors++; + std::cout << "min is " << min << " calc " << bufInOut1[0] << std::endl; + } + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." 
<< std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size, + vm["trace_file"].as()); } - } - if (*bufOut != min_val) { - std::cout << "[ERROR] Min value is " << min_val << " but kernel returned " - << *bufOut << "\n"; - errors++; + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; } + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." << std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + if (!errors) { std::cout << "\nPASS!\n\n"; return 0; } else { - std::cout << "\nfailed.\n\n"; + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; return 1; } } diff --git a/programming_examples/basic/vector_min_reduce/CMakeLists.txt b/programming_examples/basic/vector_min_reduce/CMakeLists.txt new file mode 100644 index 0000000000..c64f84842b --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/CMakeLists.txt @@ -0,0 +1,69 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ../../../programming_examples/utils +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/basic/vector_min_reduce/Makefile 
b/programming_examples/basic/vector_min_reduce/Makefile new file mode 100755 index 0000000000..ed85afa75f --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/Makefile @@ -0,0 +1,76 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../makefile-common + +ACDC_AIE = $(dir $(shell which aie-opt))/.. + +targetname = vector_min +devicename = ipu +col = 0 +CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} +KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/ + +all: build/final.xclbin build/insts.txt + +build/vector_min.o: ${KERNEL_LIB}/vector_min.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} + +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< ${devicename} ${col} > $@ + +build/final.xclbin: build/aie.mlir build/vector_min.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . 
--config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE + +trace: + ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json + +clean_trace: + rm -rf tmpTrace trace.txt + +# Changing variables when we target VCK5000 +vck5000: devicename=xcvc1902 +vck5000: col=6 +vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS} +vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/ + +vck5000: build/aie.mlir build/vector_min.o + cp build/vector_min.o* ./ + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ + -I/opt/xaiengine/include \ + -I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \ + -I${ROCM_ROOT}/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + test_vck5000.cpp \ + $(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,-rpath,${ROCM_ROOT}/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf + +clean: clean_trace + rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_min.o* vector_min.cc \ No newline at end of file diff --git a/programming_examples/basic/vector_min_reduce/README.md b/programming_examples/basic/vector_min_reduce/README.md new file mode 100644 index 0000000000..50dc039a51 --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/README.md @@ -0,0 +1,29 @@ + + +# Vector min + +This reference design can be run on either a RyzenAI IPU or a VCK5000. + +Single tile traverses through a vector in memory and returns the minimum value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). 
A buffer in tile (`col`, 2) is used to store the temporary minimum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` is dependent on whether the application is targeting IPU or VCK5000. + + +To compile and run the design for IPU: +``` +make +make run +``` + +To compile and run the design for VCK5000: +``` +make vck5000 +./test.elf +``` + diff --git a/programming_examples/basic/vector_min_reduce/aie2.py b/programming_examples/basic/vector_min_reduce/aie2.py new file mode 100755 index 0000000000..c235b7f278 --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/aie2.py @@ -0,0 +1,86 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 AMD Inc. + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith + +import sys + + +def my_vector_max(): + N = 1024 + + buffer_depth = 2 + + with mlir_mod_ctx() as ctx: + + if len(sys.argv) != 3: + raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") + + if sys.argv[1] == "ipu": + dev = AIEDevice.ipu + elif sys.argv[1] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) + def device_body(): + memRef_I_ty = T.memref(N, T.i32()) + memRef_O_ty = T.memref(1, T.i32()) + + # AIE Core Function declarations + + vector_min = external_func("vector_min", inputs=[memRef_I_ty, memRef_O_ty]) + + scalar_min = external_func("scalar_min", inputs=[memRef_I_ty, memRef_O_ty]) + + # Tile declarations + ShimTile = tile(int(sys.argv[2]), 0) + ComputeTile2 = tile(int(sys.argv[2]), 2) + + # AIE-array data movement with object fifos + 
of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty) + of_out = object_fifo( + "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "vector_min.o") + def core_body(): + for _ in for_(0xFFFFFFFF): + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + + call( + vector_min, + [elem_in, elem_out], + ) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty) + def sequence(A, C): + ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +my_vector_max() diff --git a/programming_examples/basic/vector_min_reduce/run.lit b/programming_examples/basic/vector_min_reduce/run.lit new file mode 100644 index 0000000000..b29f36cc11 --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/run.lit @@ -0,0 +1,13 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir +// RUN: mkdir build +// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// CHECK: PASS! 
+ diff --git a/programming_examples/basic/vector_min_reduce/run_vck5000.lit b/programming_examples/basic/vector_min_reduce/run_vck5000.lit new file mode 100644 index 0000000000..d314eea2a4 --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/run_vck5000.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: hsa +// +// RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir +// RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib +// RUN: %run_on_vck5000 ./test.elf + diff --git a/programming_examples/basic/vector_min_reduce/test.cpp b/programming_examples/basic/vector_min_reduce/test.cpp new file mode 100644 index 0000000000..0c596a7068 --- /dev/null +++ b/programming_examples/basic/vector_min_reduce/test.cpp @@ -0,0 +1,261 @@ +///===- test.cpp -------------------------------------------000---*- C++ +///-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED + +using INOUT0_DATATYPE = std::int32_t; +using INOUT1_DATATYPE = std::int32_t; +#endif + +namespace po = boost::program_options; + +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + test_utils::add_default_options(desc); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + // ------------------------------------------------------ + // Configure this to match your design's buffer size + // ------------------------------------------------------ + int INOUT0_VOLUME = 1024; // Input only, 1024x int32_t in this example + int INOUT1_VOLUME = 1; // Output only, 1x int32_t (the reduced minimum) + + size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); + size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); + + // TODO Remove trace for now? 
+ size_t OUT_SIZE = INOUT1_SIZE + trace_size; + + srand(time(NULL)); + + // Load instruction sequence + std::vector instr_v = + test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel and register them + // ------------------------------------------------------ + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + // Load the kernel + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + // Register xclbin + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + device.register_xclbin(xclbin); + + // Get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // Get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto 
bo_inout0 = + xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inout1 = + xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + // Initialize instruction buffer + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Initialize Inout buffer 0 + INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); + std::int32_t min = (std::int32_t)2147483647; + for (int i = 0; i < INOUT0_VOLUME; i++) { + std::int32_t next = test_utils::random_int32_t(100000); + if (next < min) + min = next; + bufInOut0[i] = next; + } + // Initialize Inout buffer 1 + // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME + + // trace_size + + // Sync buffers to update input buffer values + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // ------------------------------------------------------ + // Initialize run configs + // ------------------------------------------------------ + unsigned num_iter = n_iterations + n_warmup_iterations; + float npu_time_total = 0; + float npu_time_min = 9999999; + float npu_time_max = 0; + + int errors = 0; + + // ------------------------------------------------------ + // Main run loop + // ------------------------------------------------------ + for (unsigned iter = 0; iter < num_iter; iter++) { + + if (verbosity >= 1) { + std::cout << "Running Kernel.\n"; + } + + // Run kernel + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); + + if (iter < n_warmup_iterations) { + /* Warmup 
iterations do not count towards average runtime. */ + continue; + } + + // Copy output results and verify they are correct + if (do_verify) { + if (verbosity >= 1) { + std::cout << "Verifying results ..." << std::endl; + } + auto vstart = std::chrono::system_clock::now(); + if (bufInOut1[0] != min) { + errors++; + std::cout << "min is " << min << " calc " << bufInOut1[0] << std::endl; + } + auto vstop = std::chrono::system_clock::now(); + float vtime = + std::chrono::duration_cast(vstop - vstart) + .count(); + if (verbosity >= 1) { + std::cout << "Verify time: " << vtime << "secs." << std::endl; + } + } else { + if (verbosity >= 1) + std::cout << "WARNING: results not verified." << std::endl; + } + + // Write trace values if trace_size > 0 + if (trace_size > 0) { + test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size, + vm["trace_file"].as()); + } + + // Accumulate run times + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + + npu_time_total += npu_time; + npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; + npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; + } + + // ------------------------------------------------------ + // Print verification and timing results + // ------------------------------------------------------ + + // TODO - Mac count to guide gflops + float macs = 0; + + std::cout << std::endl + << "Avg NPU time: " << npu_time_total / n_iterations << "us." + << std::endl; + if (macs > 0) + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << std::endl; + + std::cout << std::endl + << "Min NPU time: " << npu_time_min << "us." << std::endl; + if (macs > 0) + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) + << std::endl; + + std::cout << std::endl + << "Max NPU time: " << npu_time_max << "us." 
<< std::endl; + if (macs > 0) + std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) + << std::endl; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\n"; + std::cout << "\nFailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/basic/vector_scalar/CMakeLists.txt b/programming_examples/basic/vector_scalar/CMakeLists.txt index d9f511062f..c4ca0825d4 100644 --- a/programming_examples/basic/vector_scalar/CMakeLists.txt +++ b/programming_examples/basic/vector_scalar/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_scalar/Makefile b/programming_examples/basic/vector_scalar/Makefile index 4cf3d56bbd..688dcfe2cf 100755 --- a/programming_examples/basic/vector_scalar/Makefile +++ b/programming_examples/basic/vector_scalar/Makefile @@ -6,7 +6,7 @@ # ##===----------------------------------------------------------------------===## -include ../makefile-common +include ../../makefile-common ACDC_AIE = $(dir $(shell which 
aie-opt))/.. diff --git a/programming_examples/basic/vector_softmax/CMakeLists.txt b/programming_examples/basic/vector_softmax/CMakeLists.txt index d9f511062f..c4ca0825d4 100644 --- a/programming_examples/basic/vector_softmax/CMakeLists.txt +++ b/programming_examples/basic/vector_softmax/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/vector_softmax/Makefile b/programming_examples/basic/vector_softmax/Makefile index e0ba2814ca..eea6b707f5 100755 --- a/programming_examples/basic/vector_softmax/Makefile +++ b/programming_examples/basic/vector_softmax/Makefile @@ -6,7 +6,7 @@ # ##===----------------------------------------------------------------------===## -include ../makefile-common +include ../../makefile-common targetname = testExp @@ -17,15 +17,15 @@ build/vecexp.cc: bf16_softmax.mlir cd ${@D} && aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true 
--aievec-to-cpp -o vecexp.cc build/vecexp.o: build/vecexp.cc - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} build/lut_based_ops.o: mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F} build/exp.o: exp.cc mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F} build/kernels.a: build/exp.o build/lut_based_ops.o build/vecexp.o ar rvs $@ $+ @@ -42,7 +42,7 @@ build/final.xclbin: build/aie.mlir build/kernels.a ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/basic/vector_sum/CMakeLists.txt b/programming_examples/basic/vector_sum/CMakeLists.txt index d9f511062f..20452d080e 100644 --- a/programming_examples/basic/vector_sum/CMakeLists.txt +++ b/programming_examples/basic/vector_sum/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,14 +45,18 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) +set_property(TARGET ${currentTarget} PROPERTY CXX_STANDARD 23) + target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/weight_expand/CMakeLists.txt b/programming_examples/basic/weight_expand/CMakeLists.txt index d9f511062f..c4ca0825d4 100644 --- a/programming_examples/basic/weight_expand/CMakeLists.txt +++ b/programming_examples/basic/weight_expand/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost 
install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/basic/weight_expand/Makefile b/programming_examples/basic/weight_expand/Makefile index 9fa9762e11..641b4902b3 100755 --- a/programming_examples/basic/weight_expand/Makefile +++ b/programming_examples/basic/weight_expand/Makefile @@ -6,12 +6,12 @@ # ##===----------------------------------------------------------------------===## -include ../makefile-common - -targetname = expand +include ../../makefile-common all: build/final.xclbin build/insts.txt ${targetname}.exe +targetname = expand + build/%.o: %.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} @@ -28,7 +28,7 @@ build/final.xclbin: build/aie.mlir build/expand.o ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/basic/makefile-common b/programming_examples/makefile-common similarity index 100% rename from programming_examples/basic/makefile-common rename to programming_examples/makefile-common diff --git a/programming_examples/ml/eltwise_add/CMakeLists.txt b/programming_examples/ml/eltwise_add/CMakeLists.txt index c64f84842b..c4ca0825d4 100644 --- a/programming_examples/ml/eltwise_add/CMakeLists.txt +++ b/programming_examples/ml/eltwise_add/CMakeLists.txt @@ -13,9 +13,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") @@ -40,6 +45,7 @@ project(${ProjectName}) find_package(Boost REQUIRED) add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -48,7 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) target_include_directories (${currentTarget} PUBLIC ${XRT_INC_DIR} ${Boost_INCLUDE_DIRS} - ../../../programming_examples/utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ) target_link_directories(${currentTarget} PUBLIC diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile index dd75274321..ebaf16c2f9 100644 --- a/programming_examples/ml/eltwise_add/Makefile +++ b/programming_examples/ml/eltwise_add/Makefile @@ -6,7 +6,7 @@ # ##===----------------------------------------------------------------------===## -include ../../../programming_examples/basic/makefile-common +include 
../../makefile-common all: build/final.xclbin @@ -14,7 +14,7 @@ targetname = myEltwiseAdd build/add.o: mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/add.cc -o ${@F} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/add.cc -o ${@F} build/aie.mlir: aie2.py mkdir -p ${@D} @@ -28,8 +28,7 @@ build/final.xclbin: build/aie.mlir build/add.o ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build -# cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} cd _build && ${powershell} cmake --build . --config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/utils/makefile-common b/programming_examples/utils/makefile-common deleted file mode 100644 index bca449a071..0000000000 --- a/programming_examples/utils/makefile-common +++ /dev/null @@ -1,22 +0,0 @@ -# Contains common definitions used across the Makefiles of ipu-xrt tests. -REPO_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/../../..) 
- -# VITIS related variables -VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../) -VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools -VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib -VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib - -CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20 -CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR} - -CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include -CHESSCCWRAP2_FLAGS = aie2 -I ${VITIS_AIETOOLS_DIR}/include - -TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no) -ifeq ($(TEST_POWERSHELL),yes) - powershell = powershell.exe -else - powershell = -endif diff --git a/programming_examples/vision/color_detect/CMakeLists.txt b/programming_examples/vision/color_detect/CMakeLists.txt index c92b647d43..be4d1e64c4 100644 --- a/programming_examples/vision/color_detect/CMakeLists.txt +++ b/programming_examples/vision/color_detect/CMakeLists.txt @@ -14,9 +14,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") @@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}") add_executable(${currentTarget} - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -63,6 +68,7 @@ 
target_compile_definitions(${currentTarget} PUBLIC target_include_directories (${currentTarget} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ${XRT_INC_DIR} ${OpenCV_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile index d57db0abe7..3311c71a9d 100755 --- a/programming_examples/vision/color_detect/Makefile +++ b/programming_examples/vision/color_detect/Makefile @@ -46,7 +46,8 @@ build/${targetname}.exe: test.cpp mkdir -p ${@D} rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORDETECT_WIDTH=${COLORDETECT_WIDTH} -DCOLORDETECT_HEIGHT=${COLORDETECT_HEIGHT} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORDETECT_WIDTH=${COLORDETECT_WIDTH} -DCOLORDETECT_HEIGHT=${COLORDETECT_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 + cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/vision/color_detect/test.cpp b/programming_examples/vision/color_detect/test.cpp index 36762c4197..a2eac7ae3f 100644 --- a/programming_examples/vision/color_detect/test.cpp +++ b/programming_examples/vision/color_detect/test.cpp @@ -162,8 +162,9 @@ int main(int argc, const char *argv[]) { xrt::device device; xrt::kernel kernel; - initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), - vm["kernel"].as()); + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); /* **************************************************************************** diff --git a/programming_examples/vision/color_threshold/CMakeLists.txt b/programming_examples/vision/color_threshold/CMakeLists.txt index bf67da2180..d04bc7d451 100644 --- a/programming_examples/vision/color_threshold/CMakeLists.txt +++ b/programming_examples/vision/color_threshold/CMakeLists.txt @@ -14,9 +14,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") @@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}") add_executable(${currentTarget} - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -63,6 +68,7 @@ target_compile_definitions(${currentTarget} PUBLIC target_include_directories (${currentTarget} PUBLIC 
${CMAKE_CURRENT_SOURCE_DIR}/../../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ${XRT_INC_DIR} ${OpenCV_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile index cbd478b3d9..591b93de7f 100644 --- a/programming_examples/vision/color_threshold/Makefile +++ b/programming_examples/vision/color_threshold/Makefile @@ -42,7 +42,7 @@ build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.m ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORTHRESHOLD_WIDTH=${COLORTHRESHOLD_WIDTH} -DCOLORTHRESHOLD_HEIGHT=${COLORTHRESHOLD_HEIGHT} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORTHRESHOLD_WIDTH=${COLORTHRESHOLD_WIDTH} -DCOLORTHRESHOLD_HEIGHT=${COLORTHRESHOLD_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/vision/color_threshold/test.cpp b/programming_examples/vision/color_threshold/test.cpp index 66b8177fe7..9759948d2b 100644 --- a/programming_examples/vision/color_threshold/test.cpp +++ b/programming_examples/vision/color_threshold/test.cpp @@ -71,19 +71,6 @@ int main(int argc, const char *argv[]) { po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } - - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); - /* **************************************************************************** * Load instruction sequence @@ -104,8 +91,9 @@ int main(int argc, const char *argv[]) { xrt::device device; xrt::kernel kernel; - initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), - vm["kernel"].as()); + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); /* **************************************************************************** diff --git a/programming_examples/vision/edge_detect/CMakeLists.txt b/programming_examples/vision/edge_detect/CMakeLists.txt index 6e7bb43b66..36f2a43728 100644 --- a/programming_examples/vision/edge_detect/CMakeLists.txt +++ b/programming_examples/vision/edge_detect/CMakeLists.txt @@ -14,9 +14,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE 
STRING "Path to XRT cloned repo") @@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}") add_executable(${currentTarget} - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -63,6 +68,7 @@ target_compile_definitions(${currentTarget} PUBLIC target_include_directories (${currentTarget} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ${XRT_INC_DIR} ${OpenCV_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile index a0d50006e1..1524daa510 100755 --- a/programming_examples/vision/edge_detect/Makefile +++ b/programming_examples/vision/edge_detect/Makefile @@ -45,7 +45,7 @@ build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WID ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DEDGEDETECT_WIDTH=${EDGEDETECT_WIDTH} -DEDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DEDGEDETECT_WIDTH=${EDGEDETECT_WIDTH} -DEDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/vision/edge_detect/test.cpp b/programming_examples/vision/edge_detect/test.cpp index 57389fffbf..27b68aa3d4 100644 --- a/programming_examples/vision/edge_detect/test.cpp +++ b/programming_examples/vision/edge_detect/test.cpp @@ -198,8 +198,9 @@ int main(int argc, const char *argv[]) { xrt::device device; xrt::kernel kernel; - initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), - vm["kernel"].as()); + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); /* **************************************************************************** diff --git a/programming_examples/vision/passthrough/CMakeLists.txt b/programming_examples/vision/passthrough/CMakeLists.txt index 8493494697..0adfebde01 100644 --- a/programming_examples/vision/passthrough/CMakeLists.txt +++ b/programming_examples/vision/passthrough/CMakeLists.txt @@ -14,9 +14,14 @@ # cmake needs this line cmake_minimum_required(VERSION 3.1) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + find_program(WSL NAMES powershell.exe) if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") @@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}") add_executable(${currentTarget} - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp test.cpp ) @@ -62,7 +67,8 @@ target_compile_definitions(${currentTarget} PUBLIC ) target_include_directories (${currentTarget} PUBLIC - 
${CMAKE_CURRENT_SOURCE_DIR}/../../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib ${XRT_INC_DIR} ${OpenCV_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} diff --git a/programming_examples/vision/passthrough/Makefile b/programming_examples/vision/passthrough/Makefile index 55ccefd696..b9d405f85f 100644 --- a/programming_examples/vision/passthrough/Makefile +++ b/programming_examples/vision/passthrough/Makefile @@ -8,7 +8,7 @@ include ../../makefile-common -VPATH := ../vision_kernels +VPATH := ../../../aie_kernels/aie_generic PASSTHROUGH_WIDTH = 1920 PASSTHROUGH_HEIGHT = 1080 @@ -38,7 +38,7 @@ build/final_${PASSTHROUGH_WIDTH}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_W ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_WIDTH=${PASSTHROUGH_WIDTH} -DPASSTHROUGH_HEIGHT=${PASSTHROUGH_HEIGHT} + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_WIDTH=${PASSTHROUGH_WIDTH} -DPASSTHROUGH_HEIGHT=${PASSTHROUGH_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/vision/passthrough/run.lit b/programming_examples/vision/passthrough/run.lit index 8d17f1386b..a3cff036a8 100644 --- a/programming_examples/vision/passthrough/run.lit +++ b/programming_examples/vision/passthrough/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, opencv, chess // ALLOW_RETRIES: 3 // -// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../vision_kernels/passThrough.cc -o passThrough.cc.o +// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o // RUN: %python %S/aie2.py 1920 1080 | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_WIDTH=1920 -DPASSTHROUGH_HEIGHT=1080 -I%S/../../utils %S/../../utils/xrtUtils.cpp %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem diff --git a/programming_examples/vision/passthrough/test.cpp b/programming_examples/vision/passthrough/test.cpp index ff4ceb49c2..b5136fdfc1 100644 --- a/programming_examples/vision/passthrough/test.cpp +++ b/programming_examples/vision/passthrough/test.cpp @@ -108,8 +108,9 @@ int main(int argc, const char *argv[]) { xrt::device device; xrt::kernel kernel; - initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), - vm["kernel"].as()); + test_utils::init_xrt_load_kernel(device, kernel, verbosity, + vm["xclbin"].as(), + vm["kernel"].as()); // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), diff --git a/programming_examples/utils/test_utils.py b/python/test_utils.py similarity index 99% rename from programming_examples/utils/test_utils.py rename to 
python/test_utils.py index 5e187f6330..c4b7086031 100644 --- a/programming_examples/utils/test_utils.py +++ b/python/test_utils.py @@ -5,7 +5,6 @@ import argparse - # options def parse_args(args): p = argparse.ArgumentParser() diff --git a/programming_examples/utils/test_utils.h b/runtime_lib/test_lib/test_utils.cpp similarity index 53% rename from programming_examples/utils/test_utils.h rename to runtime_lib/test_lib/test_utils.cpp index 8eedcde857..320a60a858 100644 --- a/programming_examples/utils/test_utils.h +++ b/runtime_lib/test_lib/test_utils.cpp @@ -1,4 +1,4 @@ -//===- test_utils.h ----------------------------000---*- C++ -*-===// +//===- test_utils.cpp ----------------------------000---*- C++ -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,21 +10,14 @@ // This file contains common helper functions for the generic host code -#ifndef TEST_UTILS_H -#define TEST_UTILS_H - -#include -#include - -namespace test_utils { - -namespace po = boost::program_options; +#include "test_utils.h" // -------------------------------------------------------------------------- // Command Line Argument Handling // -------------------------------------------------------------------------- -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { +void test_utils::check_arg_file_exists(po::variables_map &vm_in, + std::string name) { if (!vm_in.count(name)) { throw std::runtime_error("Error: no " + name + " file was provided\n"); } else { @@ -37,7 +30,7 @@ void check_arg_file_exists(po::variables_map &vm_in, std::string name) { } } -void add_default_options(po::options_description &desc) { +void test_utils::add_default_options(po::options_description &desc) { desc.add_options()("help,h", "produce help message")( "xclbin,x", po::value()->required(), "the input xclbin path")( @@ -56,8 +49,9 @@ void add_default_options(po::options_description &desc) { "where to 
store trace output"); } -void parse_options(int argc, const char *argv[], po::options_description &desc, - po::variables_map &vm) { +void test_utils::parse_options(int argc, const char *argv[], + po::options_description &desc, + po::variables_map &vm) { try { po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); @@ -72,15 +66,19 @@ void parse_options(int argc, const char *argv[], po::options_description &desc, std::exit(1); } - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); + try { + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + } } // -------------------------------------------------------------------------- // AIE Specifics // -------------------------------------------------------------------------- -std::vector load_instr_sequence(std::string instr_path) { +std::vector test_utils::load_instr_sequence(std::string instr_path) { std::ifstream instr_file(instr_path); std::string line; std::vector instr_v; @@ -96,24 +94,63 @@ std::vector load_instr_sequence(std::string instr_path) { } // -------------------------------------------------------------------------- -// Matrix / Float / Math +// XRT // -------------------------------------------------------------------------- - -static inline std::int16_t random_int16_t() { - return (std::int16_t)rand() % 0x10000; +void test_utils::init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel, + int verbosity, std::string xclbinFileName, + std::string kernelNameInXclbin) { + // Get a device handle + unsigned int device_index = 0; + device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << xclbinFileName << "\n"; + auto xclbin = xrt::xclbin(xclbinFileName); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << kernelNameInXclbin << "\n"; + + // Get the kernel from the xclbin + auto xkernels = 
xclbin.get_kernels(); + auto xkernel = + *std::find_if(xkernels.begin(), xkernels.end(), + [kernelNameInXclbin, verbosity](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + if (verbosity >= 1) { + std::cout << "Name: " << name << std::endl; + } + return name.rfind(kernelNameInXclbin, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + // Register xclbin + if (verbosity >= 1) + std::cout << "Registering xclbin: " << xclbinFileName << "\n"; + + device.register_xclbin(xclbin); + + // Get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + // Get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << "\n"; + kernel = xrt::kernel(context, kernelName); + + return; } -// static inline std::bfloat16_t random_bfloat16_t() { -// // Random numbers should NOT be uniformly between 0 and 1, because that -// // would make the matrix product AB always close to 1. 
-// return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); -// } +// -------------------------------------------------------------------------- +// Matrix / Float / Math +// -------------------------------------------------------------------------- // nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0 // Original author: P-Gn // Source: https://stackoverflow.com/a/32334103 -bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, - float abs_th = FLT_MIN) +bool test_utils::nearly_equal(float a, float b, float epsilon, float abs_th) // those defaults are arbitrary and could be removed { assert(std::numeric_limits::epsilon() <= epsilon); @@ -131,76 +168,15 @@ bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, return diff < std::max(abs_th, epsilon * norm); } -template -void print_matrix(const std::vector matrix, int n_cols, - int n_printable_rows = 10, int n_printable_cols = 10, - std::ostream &ostream = std::cout, - const char col_sep[] = " ", const char elide_sym[] = " ... 
", - int w = -1) { - assert(matrix.size() % n_cols == 0); - - auto maxima = std::minmax_element(matrix.begin(), matrix.end()); - T max_val = std::max(*maxima.first, std::abs(*maxima.second)); - size_t n_digits = log10(max_val); - if (w == -1) { - w = n_digits; - } - int n_rows = matrix.size() / n_cols; - - n_printable_rows = std::min(n_rows, n_printable_rows); - n_printable_cols = std::min(n_cols, n_printable_cols); - - const bool elide_rows = n_printable_rows < n_rows; - const bool elide_cols = n_printable_cols < n_cols; - - if (elide_rows || elide_cols) { - w = std::max((int)w, (int)strlen(elide_sym)); - } - - w += 3; // for decimal point and two decimal digits - ostream << std::fixed << std::setprecision(2); - -#define print_row(what) \ - for (int col = 0; col < n_printable_cols / 2; col++) { \ - ostream << std::right << std::setw(w) << (what); \ - ostream << std::setw(0) << col_sep; \ - } \ - if (elide_cols) { \ - ostream << std::setw(0) << elide_sym; \ - } \ - for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) { \ - ostream << std::right << std::setw(w) << (what); \ - ostream << std::setw(0) << col_sep; \ - } - - for (int row = 0; row < n_printable_rows / 2; row++) { - print_row(matrix[row * n_rows + col]); - ostream << std::endl; - } - if (elide_rows) { - print_row(elide_sym); - ostream << std::endl; - } - for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) { - print_row(matrix[row * n_rows + col]); - ostream << std::endl; - } - -#undef print_row -} - // -------------------------------------------------------------------------- // Tracing // -------------------------------------------------------------------------- -void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) { +void test_utils::write_out_trace(char *traceOutPtr, size_t trace_size, + std::string path) { std::ofstream fout(path); uint32_t *traceOut = (uint32_t *)traceOutPtr; for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) { 
fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i]; fout << std::endl; } -} - -} // namespace test_utils - -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/runtime_lib/test_lib/test_utils.h b/runtime_lib/test_lib/test_utils.h new file mode 100644 index 0000000000..ae094f0fcf --- /dev/null +++ b/runtime_lib/test_lib/test_utils.h @@ -0,0 +1,119 @@ +//===- test_utils.h ----------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// This file contains common helper functions for the generic host code + +#ifndef _TEST_UTILS_H_ +#define _TEST_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +namespace po = boost::program_options; + +namespace test_utils { + +void check_arg_file_exists(po::variables_map &vm_in, std::string name); + +void add_default_options(po::options_description &desc); + +void parse_options(int argc, const char *argv[], po::options_description &desc, + po::variables_map &vm); + +std::vector load_instr_sequence(std::string instr_path); + +void init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel, + int verbosity, std::string xclbinFileName, + std::string kernelNameInXclbin); + +static inline std::int16_t random_int16_t(); + +static inline std::bfloat16_t random_bfloat16_t(std::bfloat16_t scale, + std::bfloat16_t bias) { + return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) + bias); +} + +bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, + float abs_th = FLT_MIN); + +template +void print_matrix(const 
std::vector matrix, int n_cols, + int n_printable_rows = 10, int n_printable_cols = 10, + std::ostream &ostream = std::cout, + const char col_sep[] = " ", const char elide_sym[] = " ... ", + int w = -1) { + assert(matrix.size() % n_cols == 0); + + auto maxima = std::minmax_element(matrix.begin(), matrix.end()); + T max_val = std::max(*maxima.first, std::abs(*maxima.second)); + size_t n_digits = log10(max_val); + if (w == -1) { + w = n_digits; + } + int n_rows = matrix.size() / n_cols; + + n_printable_rows = std::min(n_rows, n_printable_rows); + n_printable_cols = std::min(n_cols, n_printable_cols); + + const bool elide_rows = n_printable_rows < n_rows; + const bool elide_cols = n_printable_cols < n_cols; + + if (elide_rows || elide_cols) { + w = std::max((int)w, (int)strlen(elide_sym)); + } + + w += 3; // for decimal point and two decimal digits + ostream << std::fixed << std::setprecision(2); + +#define print_row(what) \ + for (int col = 0; col < n_printable_cols / 2; col++) { \ + ostream << std::right << std::setw(w) << (what); \ + ostream << std::setw(0) << col_sep; \ + } \ + if (elide_cols) { \ + ostream << std::setw(0) << elide_sym; \ + } \ + for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) { \ + ostream << std::right << std::setw(w) << (what); \ + ostream << std::setw(0) << col_sep; \ + } + + for (int row = 0; row < n_printable_rows / 2; row++) { + print_row(matrix[row * n_rows + col]); + ostream << std::endl; + } + if (elide_rows) { + print_row(elide_sym); + ostream << std::endl; + } + for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) { + print_row(matrix[row * n_rows + col]); + ostream << std::endl; + } + +#undef print_row +} + +void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path); + +} // namespace test_utils + +#endif // _TEST_UTILS_H_ \ No newline at end of file diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit b/test/ipu-xrt/matrix_multiplication_using_dma/run.lit index 
86b773fb81..ac347dcce6 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit +++ b/test/ipu-xrt/matrix_multiplication_using_dma/run.lit @@ -8,4 +8,3 @@ // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem // RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! -` \ No newline at end of file