Skip to content

Commit

Permalink
Add programming example with BD level syntax and init values (#1947)
Browse files Browse the repository at this point in the history
Co-authored-by: AndraBisca <[email protected]>
Co-authored-by: Joseph Melber <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Dec 6, 2024
1 parent ea74c75 commit e462815
Show file tree
Hide file tree
Showing 12 changed files with 840 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# CMAKE_CXX_STANDARD 23 is only understood by CMake >= 3.20; the previous
# minimum of 3.1 would silently ignore the standard request (and is rejected
# by current CMake releases).
cmake_minimum_required(VERSION 3.20)

set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Presence of powershell.exe on PATH indicates a WSL / Windows environment.
find_program(WSL NAMES powershell.exe)

if(NOT WSL)
  # Native Linux defaults
  set(CMAKE_C_COMPILER gcc-13)
  set(CMAKE_CXX_COMPILER g++-13)
  set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
  # Windows/WSL defaults
  set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

set(ProjectName ${TARGET_NAME})
set(currentTarget ${TARGET_NAME})

if(WSL)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
  test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories(${currentTarget} PUBLIC
  ${XRT_INC_DIR}
  ${Boost_INCLUDE_DIRS}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
)

target_link_directories(${currentTarget} PUBLIC
  ${XRT_LIB_DIR}
  ${Boost_LIBRARY_DIRS}
)

if(NOT WSL)
  target_link_libraries(${currentTarget} PUBLIC
    xrt_coreutil
    boost_program_options
    boost_filesystem
  )
else()
  target_link_libraries(${currentTarget} PUBLIC
    xrt_coreutil
  )
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
#
##===----------------------------------------------------------------------===##

# Absolute directory containing this Makefile, so the build works from any CWD.
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# Shared definitions (e.g. ${powershell}) for all programming examples.
include ${srcdir}/../../makefile-common

# Host test binary name, target device, and shim column for the NPU flow.
targetname = vectorAdd
devicename = npu
col = 0

# Design generator script; set use_alt=1 to build the alternate implementation.
aie_py_src=aie2.py
use_alt?=0

ifeq (${use_alt}, 1)
aie_py_src=aie2_alt.py
endif

all: build/final.xclbin

# Generate the MLIR-AIE module from the Python design description.
build/aie.mlir: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${devicename} ${col} > $@

# Compile the MLIR into an XCLBIN plus the NPU instruction stream (insts.txt).
build/final.xclbin: build/aie.mlir
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
--no-xchesscc --no-xbridge \
--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}

# Build the host testbench with CMake; ${powershell} wraps the command on
# Windows/WSL, where the binary gets an .exe suffix.
${targetname}.exe: ${srcdir}/test.cpp
rm -rf _build
mkdir -p _build
cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}
cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
cp _build/${targetname}.exe $@
else
cp _build/${targetname} $@
endif

# Changing variables when we target VCK5000
ACDC_AIE = $(dir $(shell which aie-opt))/..

# Target-specific variable overrides: device and shim column for VCK5000.
vck5000: devicename=xcvc1902
vck5000: col=6

# VCK5000 flow: aiecc.py compiles the design and the host code into test.elf.
vck5000: build/aie.mlir
aiecc.py \
--link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
-I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \
${srcdir}/test_vck5000.cpp \
${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf

# Run the NPU host test against the generated XCLBIN and instruction stream.
run: ${targetname}.exe build/final.xclbin
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE

clean:
rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins>Vector Vector Add with BD-level Syntax and Initial Values</ins>

A simple binary operator, which uses a single AIE core to get the addition of two vectors. The overall vector size in this design is `256` and is processed by the core in smaller sub tiles of size `16`. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.

The kernel executes on AIE tile (`col`, 2). One input vector is brought into the tile from Shim tile (`col`, 0). The other input vector is initialized on the AIE tile directly with the full vector size. The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory.

The data movement in this design is described at BD-level in the DMA code regions of the AIE tile.

## Source Files Overview

1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e. XCLBIN and insts.txt for the NPU in Ryzen™ AI).

1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.

1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.

## Ryzen™ AI Usage

### C++ Testbench

To compile the design and C++ testbench:

```
make
make vectorAdd.exe
```

To run the design:

```
make run
```

## VCK5000 Usage

### C++ Testbench

To compile the design and C++ testbench:

```
make vck5000
```

To run the design:

```
./test.elf
```

152 changes: 152 additions & 0 deletions programming_examples/basic/vector_vector_add_BDs_init_values/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# vector_vector_add/aie2.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import numpy as np
import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.dialects import memref


def my_vector_add():
    """Build a vector-add design that sums two int32 vectors of length N.

    The first input streams in from the Shim tile in n-element sub-tiles;
    the second input resides entirely on the compute tile, pre-initialized
    to 0..N-1. Data movement on the compute tile is written explicitly at
    BD (buffer-descriptor) level instead of using objectfifos.

    Command line (via sys.argv): <device name: npu|xcvc1902> <column>.
    Raises ValueError on a bad argument count or unknown device name.
    """
    N = 256  # total vector length
    n = 16  # sub-tile length moved per DMA transfer
    N_div_n = N // n  # number of sub-tile iterations per full vector

    buffer_depth = 2  # NOTE(review): unused in this single-buffered design

    if len(sys.argv) != 3:
        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")

    if sys.argv[1] == "npu":
        dev = AIEDevice.npu1_1col
    elif sys.argv[1] == "xcvc1902":
        dev = AIEDevice.xcvc1902
    else:
        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

    @device(dev)
    def device_body():
        tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
        tile_ty = np.ndarray[(n,), np.dtype[np.int32]]

        # Global symbols declaration; referenced by the shim DMA allocations
        # and the runtime sequence below
        memref.global_("of_in1", T.memref(16, T.i32()), sym_visibility="public")
        memref.global_("of_out", T.memref(16, T.i32()), sym_visibility="public")

        # Tile declarations
        ShimTile = tile(int(sys.argv[2]), 0)
        ComputeTile2 = tile(int(sys.argv[2]), 2)

        # ComputeTile2 elements
        # First input vector from ShimTile: prod lock starts at 1 (buffer
        # free for the DMA), cons lock starts at 0 (no data yet)
        in1_cons_prod_lock = lock(ComputeTile2, lock_id=0, init=1)
        in1_cons_cons_lock = lock(ComputeTile2, lock_id=1, init=0)
        in1_cons_buff_0 = buffer(
            tile=ComputeTile2,
            datatype=tile_ty,
            name="in1_cons_buff_0",
        )
        # Second input vector, initialized on ComputeTile2 with 0..N-1; its
        # cons lock starts at 1 because the data is valid from the start
        in2_cons_prod_lock = lock(ComputeTile2, lock_id=2, init=0)
        in2_cons_cons_lock = lock(ComputeTile2, lock_id=3, init=1)
        in2_cons_buff_0 = buffer(
            tile=ComputeTile2,
            datatype=tensor_ty,
            name="in2_cons_buff_0",
            initial_value=np.arange(N, dtype=np.int32),
        )

        # Output to ShimTile
        out_prod_lock = lock(ComputeTile2, lock_id=4, init=1)
        out_cons_lock = lock(ComputeTile2, lock_id=5, init=0)
        out_buff_0 = buffer(
            tile=ComputeTile2,
            datatype=tile_ty,
            name="out_buff_0",
        )

        # AIE-array data movement: shim -> compute (input), compute -> shim (output)
        flow(ShimTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 0)
        flow(ComputeTile2, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 0)

        # ComputeTile DMA configuration, expressed directly at BD level
        @mem(ComputeTile2)
        def m(block):
            # channel allocation in S2MM direction, channel index 0
            s0 = dma_start(DMAChannelDir.S2MM, 0, dest=block[1], chain=block[2])
            # BD chains are assigned to a channel as well, where the last BD is
            # either another channel allocation or the end BD
            with block[1]:
                # wait on lock acquire
                use_lock(in1_cons_prod_lock, LockAction.AcquireGreaterEqual)
                # receive incoming data in in1_cons_buff_0 buffer
                dma_bd(in1_cons_buff_0)
                # release lock
                use_lock(in1_cons_cons_lock, LockAction.Release)
                # BD loops forever on itself
                next_bd(block[1])
            with block[2]:
                # channel allocation in MM2S direction, channel index 0
                s1 = dma_start(DMAChannelDir.MM2S, 0, dest=block[3], chain=block[4])
            # BD chains are assigned to a channel as well, where the last BD is
            # either another channel allocation or the end BD
            with block[3]:
                # wait on lock acquire
                use_lock(out_cons_lock, LockAction.AcquireGreaterEqual)
                # output data from out_buff_0 buffer
                dma_bd(out_buff_0)
                # release lock
                use_lock(out_prod_lock, LockAction.Release)
                # BD loops forever on itself
                next_bd(block[3])
            with block[4]:
                EndOp()

        # Set up compute tiles

        # Compute tile 2
        @core(ComputeTile2)
        def core_body():
            # Effective while(1)
            for _ in range_(sys.maxsize):
                # Acquire the full-size on-tile second input once per pass
                use_lock(in2_cons_cons_lock, LockAction.AcquireGreaterEqual)
                # Number of sub-vector "tile" iterations
                for j in range_(N_div_n):
                    use_lock(in1_cons_cons_lock, LockAction.AcquireGreaterEqual)
                    use_lock(out_prod_lock, LockAction.AcquireGreaterEqual)
                    for i in range_(n):
                        # BUGFIX: sub-tile j of the full vector starts at
                        # element j * n, not j * N_div_n; the original index
                        # only worked because n == N_div_n == 16 here.
                        out_buff_0[i] = (
                            in2_cons_buff_0[j * n + i] + in1_cons_buff_0[i]
                        )
                    use_lock(in1_cons_prod_lock, LockAction.Release)
                    use_lock(out_cons_lock, LockAction.Release)
                use_lock(in2_cons_prod_lock, LockAction.Release)

        # Allocation information for to/from AIE-array data movement (typically generated by objectfifos)
        shim_dma_allocation("of_in1", DMAChannelDir.MM2S, 0, 0)
        shim_dma_allocation("of_out", DMAChannelDir.S2MM, 0, 0)

        # To/from AIE-array data movement
        @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
        def sequence(A, B, C):
            npu_dma_memcpy_nd(metadata="of_in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
            npu_dma_memcpy_nd(metadata="of_out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
            # of_out will only complete after of_in1 completes, so we just
            # wait on of_out instead of both
            dma_wait("of_out")


with mlir_mod_ctx() as ctx:
    # Build the design inside the MLIR module context, then verify it.
    my_vector_add()
    verification = ctx.module.operation.verify()
    if verification == True:
        # Verified: emit the generated MLIR to stdout for aiecc.py.
        print(ctx.module)
    else:
        # Verification failed: emit the diagnostic instead.
        print(verification)
Loading

0 comments on commit e462815

Please sign in to comment.