Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dma_task in programming examples #1919

Merged
merged 48 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
b6d7180
Start to port programming examples to use dma task
hunhoffe Nov 13, 2024
49fa9e1
remove unneeded field
hunhoffe Nov 13, 2024
b43a48d
Finish adding alternate (dma task) impls of programming_examples/vision
hunhoffe Nov 13, 2024
fb59e20
Add alt version for ml programming examples
hunhoffe Nov 13, 2024
bbc991f
Add convenience wrappers around dma_*_task functions
hunhoffe Nov 13, 2024
2bfb325
Start porting some of the basic examples to use the dma task structure
hunhoffe Nov 13, 2024
88a033f
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 13, 2024
774e0e6
Finish rewriting programming examples to use dma task
hunhoffe Nov 13, 2024
f95bf36
Fix for [1, 1, 1, N]
jgmelber Nov 14, 2024
174cdf0
Additional verification linear case patch
jgmelber Nov 14, 2024
bbcde4c
Default sizes to 1
jgmelber Nov 14, 2024
0d86176
Use uint32_t for sizes to match transfer length for dim 0
jgmelber Nov 14, 2024
c7f38ec
Apply suggestions from code review
jgmelber Nov 14, 2024
daa4598
Revert "Default sizes to 1"
jgmelber Nov 14, 2024
259b2a6
Init sizes for vec scalar mul
jgmelber Nov 14, 2024
10b386b
calculate transfer len with less lines of code
hunhoffe Nov 14, 2024
d3ee4e6
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 14, 2024
efbff0e
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 14, 2024
ee0fa1a
Remove lingering npu_dma_memcpy_nd from alt examples
hunhoffe Nov 14, 2024
5931e38
Attempt to use repeat count correctly in examples
hunhoffe Nov 15, 2024
22ec2f0
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 15, 2024
bdc73f0
Does not fix things, but update understanding of repeat count
hunhoffe Nov 15, 2024
2837740
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 19, 2024
bb39927
Add large linear transfer test for large linear transfer (size used i…
hunhoffe Nov 19, 2024
6bb74fc
Fix minor errors with some alt examples
hunhoffe Nov 19, 2024
d0e4997
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 19, 2024
84ec074
Reduce diff between normal and alt version
hunhoffe Nov 19, 2024
339592f
Fix a few more typos
hunhoffe Nov 19, 2024
f38cdbe
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 20, 2024
6873582
Do not check for linear transfer until after setting sizes
hunhoffe Nov 20, 2024
1d09301
Zero out sides/strides for linear transfer
hunhoffe Nov 20, 2024
79b1e47
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 20, 2024
99fab89
Some prep for larger lens for DMABDOps
hunhoffe Nov 20, 2024
7cb6228
Try fixing vector exp build error
hunhoffe Nov 20, 2024
416af28
matrix vector working locally
hunhoffe Nov 20, 2024
3611676
Revert vector exp change
hunhoffe Nov 20, 2024
6256719
Another attempt to fix vector exp
hunhoffe Nov 20, 2024
384e213
small fix to cascade alt design
hunhoffe Nov 20, 2024
e5b6f10
Small fix, cascade working locally
hunhoffe Nov 20, 2024
d29571a
Start porting examples to use helper function
hunhoffe Nov 20, 2024
dc28e5c
Continue porting examples to use helper function
hunhoffe Nov 21, 2024
cdd28ec
Finish porting basic alt examples to use helper function
hunhoffe Nov 21, 2024
6d38385
Continue fixing up examples
hunhoffe Nov 22, 2024
8e96a57
Finished cleaning up alt examples
hunhoffe Nov 22, 2024
a5eda19
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 22, 2024
4e11994
Add some documentation to the programming guide regarding DMA task op…
hunhoffe Nov 22, 2024
15649f9
Commit improvements to dma_task section of programming guide
hunhoffe Nov 22, 2024
4727379
Minor formatting fixes in section-2g
hunhoffe Nov 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/aie/Dialect/AIE/IR/AIEAttrs.td
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def BDDimLayoutAttr : AttrDef<AIE_Dialect, "BDDimLayout", []> {
}];

let parameters = (ins
"uint16_t" : $size,
"uint32_t" : $size,
"uint32_t" : $stride
);

Expand Down
4 changes: 2 additions & 2 deletions include/aie/Dialect/AIE/IR/AIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -898,8 +898,8 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> {
int32_t getBufferElementTypeWidthInBytes() {
return getBuffer().getType().getElementTypeBitWidth() / 8;
}
int32_t getLenInBytes() {
if (std::optional<int32_t> len = getLen(); len.has_value())
uint32_t getLenInBytes() {
if (std::optional<uint32_t> len = getLen(); len.has_value())
return len.value() * getBufferElementTypeWidthInBytes();
else
return getBuffer().getType().getNumElements() * getBufferElementTypeWidthInBytes();
Expand Down
4 changes: 3 additions & 1 deletion lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,11 @@ LogicalResult AIEX::NpuPushQueueOp::verify() {
LogicalResult AIEX::NpuWriteBdOp::verify() {
const auto &targetModel = AIE::getTargetModel(*this);
auto numBds = targetModel.getNumBDs(getColumn(), getRow());
bool isLinearTransfer =
(getD0Size() >= 1) && (getD1Size() == 1) && (getIterationSize() == 0);
if (getBdId() > numBds)
return emitOpError("BD ID exceeds the maximum ID.");
if (getD0Size() > 0x3FF)
if (!isLinearTransfer && getD0Size() > 0x3FF)
return emitOpError("D0 Size exceeds the [0:1023] range.");
if (getD0Stride() > 0xFFFFF)
return emitOpError("D0 Stride exceeds the [0:1M-1] range.");
Expand Down
62 changes: 51 additions & 11 deletions lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {

uint32_t bd_id = bd_op.getBdId().value();
int64_t offset = bd_op.getOffsetInBytes();
uint32_t len = bd_op.getLenInBytes();
uint32_t len_addr_granularity = len * 8 / addr_granularity;
uint64_t len = bd_op.getLenInBytes();
uint64_t len_addr_granularity = len * 8 / addr_granularity;

if (offset * 8 % addr_granularity != 0) {
return bd_op->emitOpError("Offset must be aligned to ")
Expand All @@ -253,7 +253,15 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
llvm::SmallVector<int64_t, 4>(4, 0);
std::fill(padBefore.begin(), padBefore.end(), 0);
std::fill(padAfter.begin(), padAfter.end(), 0);
int d2size = 0;

auto d0size = 0;
auto d0stride = 0;
auto d1size = 0;
auto d1stride = 0;
auto d2size = 0;
auto d2stride = 0;
auto iteration_size = 0;
auto iteration_stride = 0;

if (dims && dims->size() > 0) {
llvm::SmallVector<int64_t, 4> input_sizes =
Expand All @@ -273,6 +281,12 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
input_sizes[i] = (*dims)[j].getSize();
input_strides[i] = (*dims)[j].getStride();
}

// Do not check input_sizes[3] because a repeat can still be considered a
// linear transfer
bool isLinearTransfer = (input_sizes[0] >= 1) && (input_sizes[1] == 1) &&
(input_sizes[2] == 1);

if (dims->size() > 2) {
d2size = (target_model.isMemTile(tile.getCol(), tile.getRow()))
? (*dims)[2].getSize()
Expand Down Expand Up @@ -302,16 +316,43 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
}
getHardwareStridesWraps(target_model, buffer_type, input_sizes,
input_strides, sizes, strides);

if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
tile.getRow(), input_sizes, input_strides,
sizes, strides))) {
sizes, strides, isLinearTransfer))) {
return failure();
}

iteration_size = sizes[3];
iteration_stride = strides[3];

if (!isLinearTransfer) {
// d0_size, d0_stride
d0size = sizes[0];
d0stride = strides[0];

// d1_size, d1_stride
d1size = sizes[1];
d1stride = strides[1];

// d2_stride
d2stride = strides[2];
// d2_size set elsewhere
}
if (input_sizes[3] > 1 && input_strides[3] == 0) {
// We allow users to encode the repeat_count as a dimension 3 stride
// of 0. This must lower to an iteration wrap of 0, so no stride is
// ever added. We then repeat the BD using the repeat_count in
// NpuPushQueueOp.
iteration_size = 0;
iteration_stride = 0;
}

// Ensure the total transfer length and the length expressed in the lowest
// three dimensions of strides/wraps agree. (Fourth dimension is
// iteration/repeat count and repeats the whole BD, so should not be
// incorporated in length of a single BD invocation.)
uint32_t len_dims_addr_granularity = 1;
uint64_t len_dims_addr_granularity = 1;
for (size_t i = 0; i < 3; i++) {
len_dims_addr_granularity *= sizes[i];
}
Expand Down Expand Up @@ -352,11 +393,11 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
bd_op.getLoc(), tile.getCol(), bd_id, len_addr_granularity, offset, 0,
0, 0, 0,
/* TODO: Strides/Wraps */
/*d0_size=*/sizes[0], /*d0_stride=*/strides[0],
/*d1_size=*/sizes[1], /*d1_stride=*/strides[1],
/*d2_size=*/d2size, /*d2_stride=*/strides[2],
/*iteration_current=*/0, /*iteration_size=*/sizes[3],
/*iteration_stride=*/strides[3],
/*d0_size=*/d0size, /*d0_stride=*/d0stride,
/*d1_size=*/d1size, /*d1_stride=*/d1stride,
/*d2_size=*/d2size, /*d2_stride=*/d2stride,
/*iteration_current=*/0, /*iteration_size=*/iteration_size,
/*iteration_stride=*/iteration_stride,
/* TODO: Next BD */
/*next_bd=*/next_bd_id,
/*row=*/tile.getRow(),
Expand All @@ -368,7 +409,6 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
/*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2],
/*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1],
/*d2_zero_after=*/padAfter[2]);

return setAddressForSingleBD(builder, bd_op, tile);
}

Expand Down
9 changes: 8 additions & 1 deletion programming_examples/basic/dma_transpose/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@ targetname = dmaTranspose
M ?= 64
K ?= 32

build/aie.mlir: ${srcdir}/aie2.py
aie_py_src=aie2.py
use_alt?=0

ifeq (${use_alt}, 1)
aie_py_src=aie2_alt.py
endif

build/aie.mlir: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${M} ${K} > $@

Expand Down
89 changes: 89 additions & 0 deletions programming_examples/basic/dma_transpose/aie2_alt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# dma_transpose/aie2_alt.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import argparse
import numpy as np
import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile


def my_passthrough(M, K, N, generate_access_map=False):
    """Print the MLIR module for the DMA-task variant of the transpose design.

    Args:
        M: First dimension of the (M, K) int32 tensor type.
        K: Second dimension of the (M, K) int32 tensor type.
        N: Last entry of the sizes list used for the output transfer
            (the caller in this file passes M * K).
        generate_access_map: When true, only write a picture of the input
            access order to "transpose_data.png" and return without
            emitting any MLIR.
    """
    matrix_ty = np.ndarray[(M, K), np.dtype[np.int32]]

    # Access pattern applied to the input transfer.
    transpose_pattern = TensorTile(
        (M, K),
        offset=0,
        sizes=[1, 1, K, M],
        strides=[1, 1, 1, K],
    )

    if generate_access_map:
        # Visualization-only mode: produce the image and stop here.
        transpose_pattern.visualize(
            show_arrows=True,
            plot_access_count=False,
            file_path="transpose_data.png",
        )
        return

    with mlir_mod_ctx() as module_ctx:

        @device(AIEDevice.npu1_1col)
        def device_body():
            # Tiles used by this design
            shim = tile(0, 0)
            compute = tile(0, 2)

            # Object fifos: shim -> compute tile and compute tile -> shim,
            # with the input fifo linked to the output fifo.
            fifo_in = object_fifo("in", shim, compute, 2, matrix_ty)
            fifo_out = object_fifo("out", compute, shim, 2, matrix_ty)
            object_fifo_link(fifo_in, fifo_out)

            # The core body is an empty loop; it does not touch the data.
            @core(compute)
            def core_body():
                for _ in range_(sys.maxsize):
                    pass

            # Runtime sequence moving data to and from the AIE array
            @runtime_sequence(matrix_ty, matrix_ty, matrix_ty)
            def sequence(A, B, C):
                # Input: A is read using the access pattern defined above.
                read_task = shim_dma_single_bd_task(
                    fifo_in, A, tensor_tile=transpose_pattern, issue_token=True
                )
                # Output: C is written with sizes [1, 1, 1, N].
                write_task = shim_dma_single_bd_task(
                    fifo_out, C, sizes=[1, 1, 1, N], issue_token=True
                )

                # Start both tasks, then wait for both to finish.
                tasks = (read_task, write_task)
                dma_start_task(*tasks)
                dma_await_task(*tasks)

        print(module_ctx.module)


if __name__ == "__main__":
    # Command-line entry point: parse the optional M and K dimensions and
    # print the generated design (or only produce the access-map image).
    p = argparse.ArgumentParser()
    p.add_argument("dims", help="M K", type=int, nargs="*", default=[64, 64])
    p.add_argument(
        "--generate-access-map",
        action="store_true",
        help="Produce a file showing data access order",
    )
    args = p.parse_args()

    # nargs="*" accepts any number of values, so enforce "none or both" here.
    if len(args.dims) != 2:
        print(
            "ERROR: Must provide either no dimensions or both M and K", file=sys.stderr
        )
        # sys.exit instead of the bare exit() helper: exit() is installed by
        # the site module for interactive use and is missing under `python -S`.
        sys.exit(-1)
    my_passthrough(
        M=args.dims[0],
        K=args.dims[1],
        N=args.dims[0] * args.dims[1],
        generate_access_map=args.generate_access_map,
    )
12 changes: 12 additions & 0 deletions programming_examples/basic/dma_transpose/run_makefile_alt.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Builds the design in a scratch directory (test_alt) with use_alt=1, which
// makes the Makefile use aie2_alt.py as the design source, then executes the
// Makefile's run target and expects the pass marker in its output.
//
// REQUIRES: ryzen_ai, peano
//
// RUN: mkdir -p test_alt
// RUN: cd test_alt
// RUN: make -f %S/Makefile clean
// RUN: env use_alt=1 make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!

Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ n_aie_cols?=4
kernels=mm_${m}x${k}x${n}
aieargs+=-m $m -k $k -n $n --n-aie-cols ${n_aie_cols}
target_suffix=${M}x${K}x${N}_${m}x${k}x${n}_${n_aie_cols}c
use_alt?=0

ifeq (${use_alt}, 1)
aie_py_src=aie2_alt.py
endif

include ${srcdir}/../makefile-common

Expand Down
Loading
Loading