Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ASPLOS][WIP] Passthrough kernel in basic examples #1216

Merged
merged 9 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,17 @@
#include <stdio.h>
#include <stdlib.h>

#define REL_WRITE 0
#define REL_READ 1

#include <aie_api/aie.hpp>

template <typename T, int N>
__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
const int32_t height,
const int32_t width) {
//::aie::vector<T, N> data_out;
//::aie::mask<N> temp_val;
v64uint8 *restrict outPtr = (v64uint8 *)out;
v64uint8 *restrict inPtr = (v64uint8 *)in;

for (int j = 0; j < (height * width); j += N) // Nx samples per loop
chess_prepare_for_pipelining chess_loop_range(6, ) {
//::aie::vector<T, N> tmpVector = ::aie::load_v(in);
//::aie::store_v(out, tmpVector);

*outPtr++ = *inPtr++;

// in += N;
// out += N;
}
chess_prepare_for_pipelining chess_loop_range(6, ) { *outPtr++ = *inPtr++; }
}

extern "C" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ include ../../makefile-common

SHELL := /bin/bash

targetname = passThroughHardware
targetname = passThroughDMAs
LENGTH ?= 4096

all: build/final.xclbin build/insts.txt
Expand Down Expand Up @@ -47,4 +47,4 @@ run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -l ${LENGTH}

clean:
rm -rf build _build inst
rm -rf build _build inst ${targetname}.exe
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from aie.extras.context import mlir_mod_ctx

N = 4096
N_in_bytes = N * 4

if len(sys.argv) == 2:
N = int(sys.argv[1])
Expand All @@ -41,9 +40,8 @@ def device_body():
# Compute tile 2
@core(ComputeTile2)
def core_body():
tmp = memref.alloc(1, T.i32())
v0 = arith.constant(0, T.i32())
memref.store(v0, tmp, [0])
for _ in for_(sys.maxsize):
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())
Expand Down
75 changes: 75 additions & 0 deletions programming_examples/basic/passthrough_kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Xilinx Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built
# -DPASSTHROUGH_SIZE: Value passed to the sources as the PASSTHROUGH_SIZE
#                     compile definition (default: 4096)

# target_link_directories(), used below, was introduced in CMake 3.13, so that
# is the oldest release able to process this file. Requesting anything older
# than 3.5 is additionally rejected outright by CMake 4.0 and later.
cmake_minimum_required(VERSION 3.13)

# The presence of powershell.exe on the PATH is used as the signal that the
# host build runs on the Windows side (driven from WSL).
find_program(WSL NAMES powershell.exe)

# Default dependency locations; each can be overridden with -D on the command
# line because they are cache entries.
if(NOT WSL)
  set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
  set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
  set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
  set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
set(TARGET_NAME test CACHE STRING "Target to be built")

# The project and its single executable both take their name from TARGET_NAME.
set(ProjectName ${TARGET_NAME})
set(currentTarget ${TARGET_NAME})

# Put the Release executable directly in the build directory on the Windows
# side instead of a per-configuration subdirectory.
if(WSL)
  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

# Host test executable: the local test.cpp plus the shared test utilities.
add_executable(${currentTarget}
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
  test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC
  PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
  DISABLE_ABI_CHECK=1
)

target_include_directories(${currentTarget} PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
  ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
  ${XRT_INC_DIR}
  ${Boost_INCLUDE_DIRS}
)

target_link_directories(${currentTarget} PUBLIC
  ${XRT_LIB_DIR}
  ${Boost_LIBRARY_DIRS}
)

# Boost libraries are only named explicitly for the non-Windows build; the
# Windows-side build links xrt_coreutil alone.
if(NOT WSL)
  target_link_libraries(${currentTarget} PUBLIC
    xrt_coreutil
    boost_program_options
    boost_filesystem
  )
else()
  target_link_libraries(${currentTarget} PUBLIC
    xrt_coreutil
  )
endif()
49 changes: 49 additions & 0 deletions programming_examples/basic/passthrough_kernel/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

include ../../makefile-common

# passThrough.cc has no directory in the rule below; make finds it through
# this search path.
VPATH := ../../../aie_kernels/aie_generic

# Passed to aie2.py and to the host build (-DPASSTHROUGH_SIZE).
PASSTHROUGH_SIZE = 4096

targetname = passThroughKernel

# `run` executes a command and never produces a file called "run", so it must
# be phony or a stray file of that name would suppress it.
.PHONY: all template clean run

all: build/final_${PASSTHROUGH_SIZE}.xclbin

# Generate the MLIR design by running the Python description.
build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py
	mkdir -p ${@D}
	python3 $< ${PASSTHROUGH_SIZE} > $@

# Compile the kernel from inside build/, hence the ../ prefix on the source.
build/passThrough.cc.o: passThrough.cc
	mkdir -p ${@D}
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}

build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o
	mkdir -p ${@D}
	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)

# insts.txt is written by the aiecc.py invocation above (--ipu-insts-name), so
# it is expressed as depending on the xclbin. The recipe only checks that the
# file really exists instead of letting `run` fail later with a missing input.
build/insts.txt: build/final_${PASSTHROUGH_SIZE}.xclbin
	@test -f $@

# Configure and build the host executable with CMake in a scratch directory,
# then copy it next to this Makefile under a uniform .exe name.
${targetname}.exe: test.cpp
	rm -rf _build
	mkdir -p _build
	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
	cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
	cp _build/${targetname}.exe $@
else
	cp _build/${targetname} $@
endif

run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt
	${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE

clean:
	rm -rf build _build ${targetname}.exe
170 changes: 170 additions & 0 deletions programming_examples/basic/passthrough_kernel/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.context import mlir_mod_ctx

# Buffer size in bytes: taken from the command line when exactly one argument
# is given, otherwise 1024.
N = int(sys.argv[1]) if len(sys.argv) == 2 else 1024

# A "line" is the whole N-byte buffer; the same length expressed in 4-byte words.
lineWidthInBytes = N
lineWidthInInt32s = lineWidthInBytes // 4

# Trace capture is off by default; the sizes below only matter when it is on.
enableTrace = False
traceSizeInBytes = 8192
traceSizeInInt32s = traceSizeInBytes // 4


def passthroughKernel():
"""Build the passthrough design as an MLIR module and print it.

Inside an ``mlir_mod_ctx`` this declares an ``AIEDevice.ipu`` device with a
shim tile (0, 0) and a compute tile (0, 2), connects them with the object
fifos "in" and "out", runs the external ``passThroughLine`` function on the
compute tile, and emits a ``sequence`` function describing the input and
output DMA transfers. When ``enableTrace`` is set, trace routing and trace
register writes are emitted as well. The resulting ``ctx.module`` is
printed.
"""
with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
# define types
# One buffer of lineWidthInBytes unsigned 8-bit elements.
memRef_ty = T.memref(lineWidthInBytes, T.ui8())

# AIE Core Function declarations
# Declaration only; presumably resolved from the object file named on
# the @core decorator below -- confirm.
passThroughLine = external_func(
"passThroughLine", inputs=[memRef_ty, memRef_ty, T.i32()]
)

# Tile declarations
ShimTile = tile(0, 0)
ComputeTile2 = tile(0, 2)

# Route the compute tile's "Trace" port 0 to the shim tile's "DMA" port 1.
if enableTrace:
flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1)

# AIE-array data movement with object fifos
# "in" goes shim -> compute, "out" goes compute -> shim.
of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2, "passThrough.cc.o")
def core_body():
# Each iteration acquires one output and one input buffer, calls the
# kernel on them, then releases both.
for _ in for_(sys.maxsize):
elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
call(passThroughLine, [elemIn, elemOut, lineWidthInBytes])
of_in.release(ObjectFifoPort.Consume, 1)
of_out.release(ObjectFifoPort.Produce, 1)
yield_([])

# print(ctx.module.operation.verify())

tensorSize = N
tensorSizeInInt32s = tensorSize // 4
# NOTE(review): lineWidthInInt32s and tensorSizeInInt32s are both N // 4
# here, so this type matches the transfer sizes used below.
tensor_ty = T.memref(lineWidthInInt32s, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, outTensor, notUsed):
if enableTrace:
# Trace output

# Trace_Event0, Trace_Event1: Select which events to trace.
# Note that the event buffers only appear to be transferred to DDR in
# bursts of 256 bytes. If less than 256 bytes are written, you may not
# see trace output, or only see it on the next iteration of your
# kernel invocation, as the buffer gets filled up. Note that, even
# though events are encoded as 4 byte words, it may take more than 64
# events to fill the buffer to 256 bytes and cause a flush, since
# multiple repeating events can be 'compressed' by the trace mechanism.
# In order to always generate sufficient events, we add the "assert
# TRUE" event to one slot, which fires every cycle, and thus fills our
# buffer quickly.

# Some events:
# TRUE (0x01)
# STREAM_STALL (0x18)
# LOCK_STALL (0x1A)
# EVENTS_CORE_INSTR_EVENT_1 (0x22)
# EVENTS_CORE_INSTR_EVENT_0 (0x21)
# INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction
# INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction
# INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction
# EVENTS_CORE_PORT_RUNNING_1 (0x4F)
# EVENTS_CORE_PORT_RUNNING_0 (0x4B)

# Each 32-bit value below packs four event codes, lowest byte = lowest slot.
# Trace_Event0 (4 slots)
IpuWrite32(0, 2, 0x340E0, 0x4B222125)
# Trace_Event1 (4 slots)
IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)

# Event slots as configured above:
# 0: Kernel executes vector instruction
# 1: Event 0 -- Kernel starts
# 2: Event 1 -- Kernel done
# 3: Port_Running_0
# 4: Port_Running_1
# 5: Lock Stall
# 6: Lock Acquire Instr
# 7: Lock Release Instr

# Stream_Switch_Event_Port_Selection_0
# This is necessary to capture the Port_Running_0 and Port_Running_1 events
IpuWrite32(0, 2, 0x3FF00, 0x121)

# Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
IpuWrite32(0, 2, 0x340D0, 0x10000)

# Start trace copy out.
# The trace buffer is traceSizeInBytes long and is placed at offset
# tensorSize within the buffer selected by ddr_id=2.
IpuWriteBdShimTile(
bd_id=3,
buffer_length=traceSizeInBytes,
buffer_offset=tensorSize,
enable_packet=0,
out_of_order_id=0,
packet_id=0,
packet_type=0,
column=0,
column_num=1,
d0_stride=0,
d0_wrap=0,
d1_stride=0,
d1_wrap=0,
d2_stride=0,
ddr_id=2,
iteration_current=0,
iteration_stride=0,
iteration_wrap=0,
lock_acq_enable=0,
lock_acq_id=0,
lock_acq_val=0,
lock_rel_id=0,
lock_rel_val=0,
next_bd=0,
use_next_bd=0,
valid_bd=1,
)
IpuWrite32(0, 0, 0x1D20C, 0x3)

# Transfer for the "in" object fifo: tensorSizeInInt32s words from inTensor.
ipu_dma_memcpy_nd(
metadata="in",
bd_id=0,
mem=inTensor,
sizes=[1, 1, 1, tensorSizeInInt32s],
)
# Transfer for the "out" object fifo: tensorSizeInInt32s words into outTensor.
ipu_dma_memcpy_nd(
metadata="out",
bd_id=1,
mem=outTensor,
sizes=[1, 1, 1, tensorSizeInInt32s],
)
# NOTE(review): presumably waits for the transfer on column 0, row 0,
# channel 0 to complete -- confirm against the ipu_sync definition.
ipu_sync(column=0, row=0, direction=0, channel=0)

# Emit the generated module on stdout.
print(ctx.module)


passthroughKernel()
denolf marked this conversation as resolved.
Show resolved Hide resolved
denolf marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 12 additions & 0 deletions programming_examples/basic/passthrough_kernel/run.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// (c) Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, chess
//
// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o
// RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../utils %S/../../utils/xrtUtils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
// CHECK: PASS!

Loading
Loading