Skip to content

Commit

Permalink
Using objectFifo link to access the shared memory between compute til…
Browse files Browse the repository at this point in the history
…es (#1814)
  • Loading branch information
pvasireddy-amd authored Nov 15, 2024
1 parent 232bff6 commit 3c91dcf
Show file tree
Hide file tree
Showing 12 changed files with 923 additions and 55 deletions.
60 changes: 51 additions & 9 deletions lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,40 @@ struct AIEObjectFifoStatefulTransformPass
}
}

// Only test for this objfifo belonging to a LinkOp if we are in the shared
// memory case; otherwise, we will return `true` in any case.
// Check if the objectfifo operation can use shared memory for linking. If
// the link operation is a distribute or a join operation, or if the link
// has different memref types, DMAs are required even if shared memory is
// available and the objectfifo should be split. Otherwise also check if the
// via_shared_memory attribute of the objectfifo operation is set and try to
// apply it.
if (hasSharedMemory) {
if (auto linkOp = getOptionalLinkOp(createOp)) {
splitBecauseLink.push_back(createOp);
isUsedInLinkOp = true;
int share_dir = 0;
if (!linkOp->isDistribute() && !linkOp->isJoin()) {
auto fifoInType = llvm::cast<AIEObjectFifoType>(
linkOp->getInputObjectFifos()[0].getElemType());
auto producerType =
llvm::cast<MemRefType>(fifoInType.getElementType());
auto fifoOutType = llvm::cast<AIEObjectFifoType>(
linkOp->getOutputObjectFifos()[0].getElemType());
auto consumerType =
llvm::cast<MemRefType>(fifoOutType.getElementType());
if (consumerType != producerType) {
// TODO: Support for different memref types through shared
// memory without DMAs
splitBecauseLink.push_back(createOp);
}
if (createOp.getViaSharedMem().has_value()) {
checkAndApplyViaSharedMemAttribute(createOp, share_dir);
if (share_direction == share_dir)
isUsedInLinkOp = false;
else
splitBecauseLink.push_back(createOp);
}
} else {
splitBecauseLink.push_back(createOp);
}
}
}

Expand Down Expand Up @@ -1734,17 +1762,31 @@ struct AIEObjectFifoStatefulTransformPass
//===----------------------------------------------------------------===//
coreOp.walk([&](ObjectFifoSubviewAccessOp accessOp) {
auto acqOp = accessOp.getSubview().getDefiningOp<ObjectFifoAcquireOp>();
if (ObjectFifoCreateOp op = acqOp.getObjectFifo();
getOptionalLinkOp(op)) {
accessOp->emitOpError("currently cannot access objectFifo used in "
"ObjectFifoLinkOp");
return;
if (ObjectFifoCreateOp op = acqOp.getObjectFifo()) {
if (auto linkOp = getOptionalLinkOp(op); linkOp.has_value()) {
if (!linkOp->isDistribute() && !linkOp->isJoin()) {
for (auto consumerTile : op.getConsumerTiles()) {
if (auto consumerTileOp =
dyn_cast<TileOp>(consumerTile.getDefiningOp())) {
int share_dir_value = 0;
bool sharing = isSharedMemory(
op.getProducerTileOp(), consumerTileOp, &share_dir_value);
if (!sharing)
accessOp->emitOpError(
"currently cannot access objectFifo used in "
"ObjectFifoLinkOp if the tiles don't share memory");
}
}
} else
accessOp->emitOpError(
"currently cannot access objectFifo used in "
"ObjectFifoLinkOp if it is a distribute or join link");
}
}
accessOp.getOutput().replaceAllUsesWith(
subviews[acqOp][accessOp.getIndex()]->getBuffer());
});
}

// make global symbols to replace the to be erased ObjectFifoCreateOps
for (auto createOp : device.getOps<ObjectFifoCreateOp>()) {
builder.setInsertionPointToStart(&device.getBodyRegion().front());
Expand Down
66 changes: 66 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
#
##===----------------------------------------------------------------------===##

# ---

# The following environment variables that point to the Xilinx runtime (XRT)
# should be set up by an environment setup script already.
XILINX_XRT?=/opt/xilinx/xrt
XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)

# ---

srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
XILINX_XRT_LIB?=${XILINX_XRT}/lib

CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include
XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
XRT_LIBS=-lxrt_coreutil
CXX=g++-13 -ggdb

#mlir_target?=build/aie.mlir
xclbin_target?=build/final.xclbin
insts_target?=build/insts.txt
host_target?=build/test

# Default target: build the device binary (xclbin) and the host test program.
.PHONY: all
all: ${xclbin_target} ${host_target}

# Generate the MLIR design: run aie2.py and capture what it prints on stdout.
build/aie.mlir: ${srcdir}/aie2.py
mkdir -p ${@D}
python3 $< > $@

# Compile the core kernel with xchesscc_wrapper. The command runs from inside
# the build directory, so the output is named with the file part (${@F}) only.
build/kernel.o: ${srcdir}/kernel.cc
mkdir -p ${@D}
cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}

# Run aiecc.py on the generated MLIR to produce the xclbin and the NPU
# instruction file. The command runs from inside the build directory, so:
#   - ${insts_target:build/%=%} strips the leading "build/" from the
#     instruction file name, and
#   - ${<:%=../%} prefixes the first prerequisite (build/aie.mlir) with "../".
# build/kernel.o is a prerequisite but is not passed on the command line.
${xclbin_target}: build/aie.mlir build/kernel.o
mkdir -p ${@D}
cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}

# Build the host test executable from test.cpp against the XRT headers and
# libraries. The xclbin is listed as a prerequisite so that it is built first;
# only the first prerequisite ($<, test.cpp) is passed to the compiler.
# No -DM/-DN defines are passed: this Makefile never sets M or N and test.cpp
# does not use them, so they would only define empty macros.
${host_target}: ${srcdir}/test.cpp ${xclbin_target}
	mkdir -p ${@D}
	${CXX} ${XRT_FLAGS} -o $@ $< ${XRT_LIBS}

# Build (if needed) and execute the host test program from the current
# directory.
.PHONY: run
run: ${host_target}
./${host_target}

# Run the setup_xclbin_firmware.sh script from the XRT installation on the
# built xclbin, with "Phoenix" passed as the -dev argument.
xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh
.PHONY: sign
sign: ${xclbin_target}
${xclbin_sign} -dev Phoenix -xclbin $<

# Remove the build directory. The leading "-" tells make to ignore a failure
# of rm (for example when the directory does not exist).
.PHONY: clean
clean:
-rm -r build
73 changes: 73 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.

# REQUIRES: ryzen_ai, valid_xchess_license
#
# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# CHECK: PASS!
import numpy as np

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
from aie.extras.context import mlir_mod_ctx

N = 50
O = 250
n_rows = 5
dev = AIEDevice.npu1_1col
col = 0


def nested_loops():
    """Build the nested-loops object FIFO design and print its MLIR module.

    The design is described inside an ``mlir_mod_ctx`` context for the device
    selected by the module-level ``dev``. With the module-level constants
    (N = 50, O = 250, n_rows = 5) each object FIFO element holds
    ``N // n_rows`` = 10 int32 values. The generated module is printed to
    stdout; nothing is returned.
    """
    with mlir_mod_ctx() as ctx:

        @device(dev)
        def device_body():
            # Element type moved through both object FIFOs:
            # a 1-D array of N // n_rows int32 values.
            tensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]

            # Tile declarations
            ShimTile = tile(col, 0)
            ComputeTile = tile(col, 2)

            # AIE-array data movement with object fifos
            # "in" goes ShimTile -> ComputeTile, "out" goes ComputeTile -> ShimTile.
            # NOTE(review): the literal 2 is presumably the FIFO depth — confirm
            # against the object_fifo signature.
            of_in = object_fifo("in", ShimTile, ComputeTile, 2, tensor_ty)
            of_out = object_fifo("out", ComputeTile, ShimTile, 2, tensor_ty)

            # AIE Core Function declarations
            # External kernel taking one input and one output element; the
            # implementation is expected to come from kernel.o (see @core below).
            passthrough_10_i32 = external_func(
                "passthrough_10_i32", inputs=[tensor_ty, tensor_ty]
            )

            # Set up compute tiles
            @core(ComputeTile, "kernel.o")
            def core_body():
                # Outer loop: acquire one input element per iteration (5 total).
                for _ in range_(5):
                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
                    # Inner loop: the same input element is passed to the kernel
                    # 5 times, each time into a freshly acquired output element,
                    # so 5 * 5 = 25 output elements are produced overall.
                    for _ in range_(5):
                        elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
                        passthrough_10_i32(elemIn, elemOut)
                        of_out.release(ObjectFifoPort.Produce, 1)
                    # The input element is released only after all 5 inner
                    # iterations have used it.
                    of_in.release(ObjectFifoPort.Consume, 1)

            # To/from AIE-array data movement
            @runtime_sequence(tensor_ty, tensor_ty)
            def sequence(A, C):
                # A is bound to the "in" FIFO with sizes [1, 1, 1, N] and C to the
                # "out" FIFO with sizes [1, 1, 1, O].
                # NOTE(review): presumably this moves N input and O output int32
                # values in total — confirm against npu_dma_memcpy_nd semantics.
                npu_dma_memcpy_nd(
                    metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True
                )
                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, O])
                dma_wait(of_in, of_out)

        print(ctx.module)


nested_loops()
22 changes: 22 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 AMD Inc.

#include <aie_api/aie.hpp>

// Copies the first N elements of `in` into `out`, one element at a time.
//
// Template parameters:
//   T_in  - element type read from `in`
//   T_out - element type written to `out` (each element is assigned from T_in)
//   N     - number of elements to copy, fixed at compile time
//
// Both pointers are declared __restrict: the caller promises that the two
// buffers do not overlap.
template <typename T_in, typename T_out, unsigned long N>
void passthrough(const T_in *__restrict in, T_out *__restrict out) {
  // The index has the same type as N so the loop bound comparison does not
  // mix signed and unsigned operands.
  for (unsigned long i = 0; i < N; ++i) {
    out[i] = in[i];
  }
}

extern "C" {

// C-linkage entry point: copies 10 `int` values from `in` to `out` by
// instantiating the generic passthrough template for that fixed size.
void passthrough_10_i32(const int *__restrict in, int *__restrict out) {
  constexpr unsigned long kNumElements = 10;
  passthrough<int, int, kNumElements>(in, out);
}

} // extern "C"
139 changes: 139 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 AMD Inc.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#ifndef XCLBIN
#define XCLBIN "build/final.xclbin"
#endif

#ifndef INSTS_TXT
#define INSTS_TXT "build/insts.txt"
#endif

#ifndef KERNEL_NAME
#define KERNEL_NAME "MLIR_AIE"
#endif

// Buffer geometry used by the host test. The input holds 50 ints and the
// output 250 ints, laid out as rows of WIDTH (10) ints each.
// Every macro body is fully parenthesized so it expands correctly inside a
// larger expression (e.g. `x / INPUT_ROWS`).
#define INPUT_SIZE (50 * sizeof(int))   // in bytes
#define OUTPUT_SIZE (250 * sizeof(int)) // in bytes
#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
#define WIDTH 10                        // ints per row
#define INPUT_ROWS (INPUT_SIZE / WIDTH_SIZE)
#define OUTPUT_ROWS (OUTPUT_SIZE / WIDTH_SIZE)

// Loads an instruction sequence from a text file.
//
// The file is read line by line; the first whitespace-delimited token of each
// line is parsed as a hexadecimal 32-bit word and appended to the result.
//
// Parameters:
//   instr_path - path of the instruction text file.
// Returns:
//   The parsed words, in file order.
// Throws:
//   std::runtime_error if the file cannot be opened, or if any line does not
//   start with a parsable hexadecimal value (this includes empty lines).
std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
  std::ifstream instr_file(instr_path);
  // Report a missing/unreadable file explicitly instead of silently returning
  // an empty sequence.
  if (!instr_file.is_open()) {
    throw std::runtime_error("Unable to open instruction file: " + instr_path);
  }

  std::vector<uint32_t> instr_v;
  std::string line;
  while (std::getline(instr_file, line)) {
    std::istringstream iss(line);
    uint32_t a;
    if (!(iss >> std::hex >> a)) {
      throw std::runtime_error("Unable to parse instruction file\n");
    }
    instr_v.push_back(a);
  }
  return instr_v;
}

int main(int argc, const char *argv[]) {

std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
assert(instr_v.size() > 0);

// Get a device handle
unsigned int device_index = 0;
xrt::device device = xrt::device(device_index);

// Load the xclbin
xrt::xclbin xclbin = xrt::xclbin(XCLBIN);

// Get the kernel from the xclbin
std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
xrt::xclbin::kernel xkernel = *std::find_if(
xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
return k.get_name().rfind(KERNEL_NAME, 0) == 0;
});
std::string kernel_name = xkernel.get_name();
assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);

device.register_xclbin(xclbin);

// get a hardware context
xrt::hw_context context(device, xclbin.get_uuid());

// get a kernel handle
auto kernel = xrt::kernel(context, kernel_name);

auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
auto bo_input =
xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_output =
xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));

int *buf_input = bo_input.map<int *>();
std::cout << std::endl << std::endl << "Input: " << std::endl;
for (int i = 0; i < INPUT_ROWS; i++) {
std::cout << "row " << i << " : ";
for (int j = 0; j < WIDTH; j++) {
buf_input[i * WIDTH + j] = i;
std::cout << buf_input[i * WIDTH + j] << " ";
}
std::cout << std::endl << std::endl;
}
int *buf_output = bo_output.map<int *>();
memset(buf_output, 0, OUTPUT_SIZE);

// Instruction buffer for DMA configuration
void *buf_instr = bo_instr.map<void *>();
memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));

bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);

unsigned int opcode = 3;
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
ert_cmd_state r = run.wait();
if (r != ERT_CMD_STATE_COMPLETED) {
std::cout << "Kernel did not complete. Returned status: " << r << "\n";
return 1;
}

bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

bool pass = true;
std::cout << std::endl << "Output: " << std::endl;
int expected_output = 0;
int five_repetitions = 0;
for (int i = 0; i < OUTPUT_ROWS; i++) {
std::cout << "row " << i << std::endl;
if (five_repetitions == 5) {
expected_output++;
five_repetitions = 0;
}
for (int j = 0; j < WIDTH; j++) {
std::cout << "expected: " << expected_output << ", ";
std::cout << "got: " << buf_output[i * WIDTH + j] << std::endl;
pass &= buf_output[i * WIDTH + j] == expected_output;
}
std::cout << std::endl << std::endl;
five_repetitions++;
}
std::cout << std::endl << std::endl;
std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;

return 0;
}
Loading

0 comments on commit 3c91dcf

Please sign in to comment.