Skip to content

Commit

Permalink
Add programming example with BD level syntax and init values (#1947)
Browse files Browse the repository at this point in the history
Co-authored-by: AndraBisca <[email protected]>
Co-authored-by: Joseph Melber <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Dec 6, 2024
1 parent ea74c75 commit e462815
Show file tree
Hide file tree
Showing 12 changed files with 840 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# CMAKE_CXX_STANDARD 23 is only understood by CMake >= 3.20; the previous
# minimum of 3.1 would silently ignore the standard request (and is rejected
# by current CMake releases).
cmake_minimum_required(VERSION 3.20)

set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Presence of powershell.exe on PATH indicates a WSL / Windows environment.
find_program(WSL NAMES powershell.exe)

if(NOT WSL)
  # Native Linux defaults
  set(CMAKE_C_COMPILER gcc-13)
  set(CMAKE_CXX_COMPILER g++-13)
  set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
  # Windows/WSL defaults
  set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

set(ProjectName ${TARGET_NAME})
set(currentTarget ${TARGET_NAME})

if(WSL)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
  test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories(${currentTarget} PUBLIC
  ${XRT_INC_DIR}
  ${Boost_INCLUDE_DIRS}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
)

target_link_directories(${currentTarget} PUBLIC
  ${XRT_LIB_DIR}
  ${Boost_LIBRARY_DIRS}
)

if(NOT WSL)
  target_link_libraries(${currentTarget} PUBLIC
    xrt_coreutil
    boost_program_options
    boost_filesystem
  )
else()
  target_link_libraries(${currentTarget} PUBLIC
    xrt_coreutil
  )
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
#
##===----------------------------------------------------------------------===##

# Absolute directory containing this Makefile, so the build works from any CWD.
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# Shared definitions (e.g. ${powershell}) for all programming examples.
include ${srcdir}/../../makefile-common

# Host test binary name, target device, and shim column for the NPU flow.
targetname = vectorAdd
devicename = npu
col = 0

# Design generator script; set use_alt=1 to build the alternate implementation.
aie_py_src=aie2.py
use_alt?=0

ifeq (${use_alt}, 1)
aie_py_src=aie2_alt.py
endif

all: build/final.xclbin

# Generate the MLIR-AIE module from the Python design description.
build/aie.mlir: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${devicename} ${col} > $@

# Compile the MLIR into an XCLBIN plus the NPU instruction stream (insts.txt).
build/final.xclbin: build/aie.mlir
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
--no-xchesscc --no-xbridge \
--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}

# Build the host testbench with CMake; ${powershell} wraps the command on
# Windows/WSL, where the binary gets an .exe suffix.
${targetname}.exe: ${srcdir}/test.cpp
rm -rf _build
mkdir -p _build
cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}
cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
cp _build/${targetname}.exe $@
else
cp _build/${targetname} $@
endif

# Changing variables when we target VCK5000
ACDC_AIE = $(dir $(shell which aie-opt))/..

# Target-specific variable overrides: device and shim column for VCK5000.
vck5000: devicename=xcvc1902
vck5000: col=6

# VCK5000 flow: aiecc.py compiles the design and the host code into test.elf.
vck5000: build/aie.mlir
aiecc.py \
--link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
-I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \
${srcdir}/test_vck5000.cpp \
${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf

# Run the NPU host test against the generated XCLBIN and instruction stream.
run: ${targetname}.exe build/final.xclbin
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

clean:
rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins>Vector Vector Add with BD-level Syntax and Initial Values</ins>

A simple binary operator, which uses a single AIE core to get the addition of two vectors. The overall vector size in this design is `256` and is processed by the core in smaller sub tiles of size `16`. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.

The kernel executes on AIE tile (`col`, 2). One input vector is brought into the tile from Shim tile (`col`, 0). The other input vector is initialized on the AIE tile directly with the full vector size. The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory.

The data movement in this design is described at BD-level in the DMA code regions of the AIE tile.

## Source Files Overview

1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e. XCLBIN and insts.txt for the NPU in Ryzen™ AI).

1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.

1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.

## Ryzen™ AI Usage

### C++ Testbench

To compile the design and C++ testbench:

```
make
make vectorAdd.exe
```

To run the design:

```
make run
```

## VCK5000 Usage

### C++ Testbench

To compile the design and C++ testbench:

```
make vck5000
```

To run the design:

```
./test.elf
```

152 changes: 152 additions & 0 deletions programming_examples/basic/vector_vector_add_BDs_init_values/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# vector_vector_add/aie2.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import numpy as np
import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.dialects import memref


def my_vector_add():
    """Build a vector-add design that sums two int32 vectors of length N.

    The first input streams in from the Shim tile in n-element sub-tiles;
    the second input resides entirely on the compute tile, pre-initialized
    to 0..N-1. Data movement on the compute tile is written explicitly at
    BD (buffer-descriptor) level instead of using objectfifos.

    Command line (via sys.argv): <device name: npu|xcvc1902> <column>.
    Raises ValueError on a bad argument count or unknown device name.
    """
    N = 256  # total vector length
    n = 16  # sub-tile length moved per DMA transfer
    N_div_n = N // n  # number of sub-tile iterations per full vector

    buffer_depth = 2  # NOTE(review): unused in this single-buffered design

    if len(sys.argv) != 3:
        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")

    if sys.argv[1] == "npu":
        dev = AIEDevice.npu1_1col
    elif sys.argv[1] == "xcvc1902":
        dev = AIEDevice.xcvc1902
    else:
        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

    @device(dev)
    def device_body():
        tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
        tile_ty = np.ndarray[(n,), np.dtype[np.int32]]

        # Global symbols declaration; referenced by the shim DMA allocations
        # and the runtime sequence below
        memref.global_("of_in1", T.memref(16, T.i32()), sym_visibility="public")
        memref.global_("of_out", T.memref(16, T.i32()), sym_visibility="public")

        # Tile declarations
        ShimTile = tile(int(sys.argv[2]), 0)
        ComputeTile2 = tile(int(sys.argv[2]), 2)

        # ComputeTile2 elements
        # First input vector from ShimTile: prod lock starts at 1 (buffer
        # free for the DMA), cons lock starts at 0 (no data yet)
        in1_cons_prod_lock = lock(ComputeTile2, lock_id=0, init=1)
        in1_cons_cons_lock = lock(ComputeTile2, lock_id=1, init=0)
        in1_cons_buff_0 = buffer(
            tile=ComputeTile2,
            datatype=tile_ty,
            name="in1_cons_buff_0",
        )
        # Second input vector, initialized on ComputeTile2 with 0..N-1; its
        # cons lock starts at 1 because the data is valid from the start
        in2_cons_prod_lock = lock(ComputeTile2, lock_id=2, init=0)
        in2_cons_cons_lock = lock(ComputeTile2, lock_id=3, init=1)
        in2_cons_buff_0 = buffer(
            tile=ComputeTile2,
            datatype=tensor_ty,
            name="in2_cons_buff_0",
            initial_value=np.arange(N, dtype=np.int32),
        )

        # Output to ShimTile
        out_prod_lock = lock(ComputeTile2, lock_id=4, init=1)
        out_cons_lock = lock(ComputeTile2, lock_id=5, init=0)
        out_buff_0 = buffer(
            tile=ComputeTile2,
            datatype=tile_ty,
            name="out_buff_0",
        )

        # AIE-array data movement: shim -> compute (input), compute -> shim (output)
        flow(ShimTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 0)
        flow(ComputeTile2, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 0)

        # ComputeTile DMA configuration, expressed directly at BD level
        @mem(ComputeTile2)
        def m(block):
            # channel allocation in S2MM direction, channel index 0
            s0 = dma_start(DMAChannelDir.S2MM, 0, dest=block[1], chain=block[2])
            # BD chains are assigned to a channel as well, where the last BD is
            # either another channel allocation or the end BD
            with block[1]:
                # wait on lock acquire
                use_lock(in1_cons_prod_lock, LockAction.AcquireGreaterEqual)
                # receive incoming data in in1_cons_buff_0 buffer
                dma_bd(in1_cons_buff_0)
                # release lock
                use_lock(in1_cons_cons_lock, LockAction.Release)
                # BD loops forever on itself
                next_bd(block[1])
            with block[2]:
                # channel allocation in MM2S direction, channel index 0
                s1 = dma_start(DMAChannelDir.MM2S, 0, dest=block[3], chain=block[4])
            # BD chains are assigned to a channel as well, where the last BD is
            # either another channel allocation or the end BD
            with block[3]:
                # wait on lock acquire
                use_lock(out_cons_lock, LockAction.AcquireGreaterEqual)
                # output data from out_buff_0 buffer
                dma_bd(out_buff_0)
                # release lock
                use_lock(out_prod_lock, LockAction.Release)
                # BD loops forever on itself
                next_bd(block[3])
            with block[4]:
                EndOp()

        # Set up compute tiles

        # Compute tile 2
        @core(ComputeTile2)
        def core_body():
            # Effective while(1)
            for _ in range_(sys.maxsize):
                # Acquire the full-size on-tile second input once per pass
                use_lock(in2_cons_cons_lock, LockAction.AcquireGreaterEqual)
                # Number of sub-vector "tile" iterations
                for j in range_(N_div_n):
                    use_lock(in1_cons_cons_lock, LockAction.AcquireGreaterEqual)
                    use_lock(out_prod_lock, LockAction.AcquireGreaterEqual)
                    for i in range_(n):
                        # BUGFIX: sub-tile j of the full vector starts at
                        # element j * n, not j * N_div_n; the original index
                        # only worked because n == N_div_n == 16 here.
                        out_buff_0[i] = (
                            in2_cons_buff_0[j * n + i] + in1_cons_buff_0[i]
                        )
                    use_lock(in1_cons_prod_lock, LockAction.Release)
                    use_lock(out_cons_lock, LockAction.Release)
                use_lock(in2_cons_prod_lock, LockAction.Release)

        # Allocation information for to/from AIE-array data movement (typically generated by objectfifos)
        shim_dma_allocation("of_in1", DMAChannelDir.MM2S, 0, 0)
        shim_dma_allocation("of_out", DMAChannelDir.S2MM, 0, 0)

        # To/from AIE-array data movement
        @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
        def sequence(A, B, C):
            npu_dma_memcpy_nd(metadata="of_in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
            npu_dma_memcpy_nd(metadata="of_out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
            # of_out will only complete after of_in1 completes, so we just
            # wait on of_out instead of both
            dma_wait("of_out")


with mlir_mod_ctx() as ctx:
    # Build the design inside the MLIR module context, then verify it.
    my_vector_add()
    verification = ctx.module.operation.verify()
    if verification == True:
        # Verified: emit the generated MLIR to stdout for aiecc.py.
        print(ctx.module)
    else:
        # Verification failed: emit the diagnostic instead.
        print(verification)
Loading

0 comments on commit e462815

Please sign in to comment.