From cb1d1fc75c9b736d0c9375cb5d9539d4795635c2 Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Tue, 10 Dec 2024 13:44:16 -0700 Subject: [PATCH] Fixing up naming and versions for a few more examples --- .../basic/dma_transpose/README.md | 8 +- .../matrix_multiplication/cascade/README.md | 4 +- .../matrix_vector/Makefile | 2 +- .../matrix_vector/README.md | 16 +-- .../single_core/Makefile.chess | 2 +- .../single_core/README.md | 20 +-- .../basic/passthrough_kernel/Makefile | 14 +- .../basic/passthrough_kernel/README.md | 33 ++--- .../basic/passthrough_kernel/aie2.py | 102 -------------- .../{aie2_iron.py => passthrough_kernel.py} | 2 +- ...{aie2_alt.py => passthrough_kernel_alt.py} | 2 +- .../passthrough_kernel/run_makefile_iron.lit | 10 -- .../basic/passthrough_pykernel/Makefile | 14 +- .../basic/passthrough_pykernel/README.md | 37 +++--- .../basic/passthrough_pykernel/aie2.py | 84 ------------ .../{aie2_iron.py => passthrough_pykernel.py} | 2 +- ...ie2_alt.py => passthrough_pykernel_alt.py} | 2 +- .../run_makefile_iron.lit | 12 -- .../basic/row_wise_bias_add/Makefile | 12 +- .../basic/row_wise_bias_add/README.md | 7 +- .../basic/row_wise_bias_add/aie2.py | 90 ------------- .../{aie2_iron.py => row_wise_bias_add.py} | 0 .../{aie2_alt.py => row_wise_bias_add_alt.py} | 0 .../row_wise_bias_add/run_makefile_iron.lit | 11 -- .../tiling_exploration/per_tile/Makefile | 5 +- .../tiling_exploration/per_tile/README.md | 8 +- .../per_tile/{aie2_iron.py => per_tile.py} | 2 +- .../tiling_exploration/tile_group/Makefile | 5 +- .../tiling_exploration/tile_group/README.md | 4 +- .../{aie2_iron.py => tile_group.py} | 2 +- .../basic/vector_exp/Makefile | 12 +- .../basic/vector_exp/README.md | 19 ++- programming_examples/basic/vector_exp/aie2.py | 125 ------------------ .../{aie2_iron.py => vector_exp.py} | 2 +- .../{aie2_alt.py => vector_exp_alt.py} | 2 +- .../basic/vector_reduce_add/aie2.py | 77 ----------- .../vector_reduce_add/run_makefile_iron.lit | 12 -- .../{aie2_iron.py => 
vector_reduce_add.py} | 0 .../{aie2_alt.py => vector_reduce_add_alt.py} | 0 39 files changed, 111 insertions(+), 650 deletions(-) delete mode 100755 programming_examples/basic/passthrough_kernel/aie2.py rename programming_examples/basic/passthrough_kernel/{aie2_iron.py => passthrough_kernel.py} (97%) rename programming_examples/basic/passthrough_kernel/{aie2_alt.py => passthrough_kernel_alt.py} (98%) delete mode 100644 programming_examples/basic/passthrough_kernel/run_makefile_iron.lit delete mode 100644 programming_examples/basic/passthrough_pykernel/aie2.py rename programming_examples/basic/passthrough_pykernel/{aie2_iron.py => passthrough_pykernel.py} (97%) rename programming_examples/basic/passthrough_pykernel/{aie2_alt.py => passthrough_pykernel_alt.py} (97%) delete mode 100644 programming_examples/basic/passthrough_pykernel/run_makefile_iron.lit delete mode 100644 programming_examples/basic/row_wise_bias_add/aie2.py rename programming_examples/basic/row_wise_bias_add/{aie2_iron.py => row_wise_bias_add.py} (100%) rename programming_examples/basic/row_wise_bias_add/{aie2_alt.py => row_wise_bias_add_alt.py} (100%) delete mode 100644 programming_examples/basic/row_wise_bias_add/run_makefile_iron.lit rename programming_examples/basic/tiling_exploration/per_tile/{aie2_iron.py => per_tile.py} (98%) rename programming_examples/basic/tiling_exploration/tile_group/{aie2_iron.py => tile_group.py} (98%) delete mode 100644 programming_examples/basic/vector_exp/aie2.py rename programming_examples/basic/vector_exp/{aie2_iron.py => vector_exp.py} (98%) rename programming_examples/basic/vector_exp/{aie2_alt.py => vector_exp_alt.py} (98%) delete mode 100644 programming_examples/basic/vector_reduce_add/aie2.py delete mode 100644 programming_examples/basic/vector_reduce_add/run_makefile_iron.lit rename programming_examples/basic/vector_reduce_add/{aie2_iron.py => vector_reduce_add.py} (100%) rename programming_examples/basic/vector_reduce_add/{aie2_alt.py => 
vector_reduce_add_alt.py} (100%) diff --git a/programming_examples/basic/dma_transpose/README.md b/programming_examples/basic/dma_transpose/README.md index c41afc78a8..9ddad9b9e7 100644 --- a/programming_examples/basic/dma_transpose/README.md +++ b/programming_examples/basic/dma_transpose/README.md @@ -33,24 +33,24 @@ The implicit copy is performed using the `ObjectFifo.forward()` function that sp The `object_fifo_link` operation used explicitly by`dma_transpose.py` and `dma_transpose._alt.py` is described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide. To compile and run the design `dma_transpose_iron.py` for NPU: -```bash +```shell make env use_iron=1 make run ``` To compile and run the design `dma_transpose.py` for NPU: -```bash +```shell make make run ``` To compile and run the design `dma_transpose_alt.py` for NPU: -```bash +```shell make env use_alt=1 make run ``` To generate a data visualization of the transpose (like that above), run: -```bash +```shell make generate_access_map ``` \ No newline at end of file diff --git a/programming_examples/basic/matrix_multiplication/cascade/README.md b/programming_examples/basic/matrix_multiplication/cascade/README.md index 8c9b3a167a..346ae4c2aa 100644 --- a/programming_examples/basic/matrix_multiplication/cascade/README.md +++ b/programming_examples/basic/matrix_multiplication/cascade/README.md @@ -20,5 +20,5 @@ The current design only works for scalar `int16`. The performance sweep results against `whole_array` can be found at [here](https://gist.github.com/Yu-Zhewen/da3fed9feb278b973f35fb78c2d3a484), no gain observed. -The orignal implementation of the design is found at [matmul.py](./matmul.py). An alternative version of the design, featuring different runtime operations, -is found at [matmul_alt.py](./matmul_alt.py). \ No newline at end of file +The orignal implementation of the design is found at [cascade.py](./cascade.py). 
An alternative version of the design, featuring different runtime operations, +is found at [cascade_alt.py](./cascade_alt.py). \ No newline at end of file diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile index 7db355f786..ec0afe7c59 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile @@ -12,7 +12,7 @@ subdir=matrix_vector targetname=matrix_vector # Currently does not accept reconfiguring size via these variables; must change -# in source at matmul.py as well as here +# in source at .py as well as here M=288 K=288 N=1 diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/README.md b/programming_examples/basic/matrix_multiplication/matrix_vector/README.md index 24fb0893ed..92b1d3b20f 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/README.md +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/README.md @@ -14,8 +14,8 @@ In this design, one or multiple AI Engine compute cores (spread across hardware > This design relies on the same basic concepts as the [whole-array matrix-matrix multiplication design](../whole_array/README.md), and it is structured very similarly to that design. Please refer to the in-depth explanation of that design along with the below outlined differences for a better understanding of this design. -The orignal implementation of the design is found at [matmul.py](./matmul.py). An alternative version of the design, featuring different runtime operations, -is found at [matmul_alt.py](./matmul_alt.py). A version written in a higher-level form of IRON is found at [matmul_iron.py](./matmul_iron.py). +The orignal implementation of the design is found at [matrix_vector.py](./matrix_vector.py). 
An alternative version of the design, featuring different runtime operations, +is found at [matrix_vector_alt.py](./matrix_vector_alt.py). A version written in a higher-level form of IRON is found at [matrix_vector_iron.py](./matrix_vector_iron.py). ## Differences from the [Whole-Array Matrix-Matrix Multiplication Design](../whole_array/README.md) @@ -28,22 +28,22 @@ is found at [matmul_alt.py](./matmul_alt.py). A version written in a higher-leve You need C++23 for `bfloat16_t` support. It can be found in g++-13: https://lindevs.com/install-g-on-ubuntu To compile and run the original design: -``` +```shell make env use_alt=1 -make env use_alt=1 matrixVectorMultiplication.exe +make env use_alt=1 matrix_vector.exe make env use_alt=1 run ``` To compile and run the alternative design: -``` +```shell make env use_alt=1 -make env use_alt=1 matrixVectorMultiplication.exe +make env use_alt=1 matrix_vector.exe make env use_alt=1 run ``` To compile and run the higher-level IRON design: -``` +```shell make env use_iron=1 -make env use_iron=1 matrixVectorMultiplication.exe +make env use_iron=1 matrix_vector.exe make env use_iron=1 run ``` diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile.chess b/programming_examples/basic/matrix_multiplication/single_core/Makefile.chess index b975017171..f80265911a 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/Makefile.chess +++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile.chess @@ -26,7 +26,7 @@ target_suffix=${M}x${K}x${N}_${m}x${k}x${n} use_alt?=0 ifeq (${use_alt}, 1) -aie_py_src=matmul_alt.py +aie_py_src=${targetname}_alt.py endif include ${srcdir}/../makefile-common diff --git a/programming_examples/basic/matrix_multiplication/single_core/README.md b/programming_examples/basic/matrix_multiplication/single_core/README.md index 47f3c28bd0..986a971c3e 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/README.md +++ 
b/programming_examples/basic/matrix_multiplication/single_core/README.md @@ -19,35 +19,35 @@ In this design, a single AI Engine compute core performs a matrix-matrix-multipl * This design supports tracing; See [below](#tracing). * Only a single core performs computations. As such, we only need a single ObjectFIFO for each of the transfers between the levels (shim → memory, memory → compute, and back). These ObjectFIFOs are named `inA`, `inB`, `outC` and `memA`, `memB` and `memC`, respectively. -## Notes on the `matmul_alt.py` Implementation +## Notes on the `single_core_alt.py` Implementation -As in the whole-array design, the [`matmul.py`](./matmul.py) file describes the data movement of the design. This single core example also comes with an alternative implementation, which can be found in [`matmul_alt.py`](./matmul_alt.py). If you specify `use_alt=1` as an environment variable at compile time, this alternative implementation will be used in place of `matmul.py`. +As in the whole-array design, the [`single_core.py`](./single_core.py) file describes the data movement of the design. This single core example also comes with an alternative implementation, which can be found in [`single_core_alt.py`](./single_core_alt.py). If you specify `use_alt=1` as an environment variable at compile time, this alternative implementation will be used in place of `single_core.py`. -Functionally, `matmul.py` and `matmul_alt.py` are intended to be identical. However, `matmul_alt.py` is implemented using a new syntax for runtime buffer descriptor configuration on the shim. Specifically, `matmul_alt.py` uses the `aiex.dma_configure_task_for`, `aiex.dma_start_task` and `aiex.dma_await_task` operations instead of `aiex.dma_memcpy_nd`. +Functionally, `single_core.py` and `single_core_alt.py` are intended to be identical. However, `single_core_alt.py` is implemented using a new syntax for runtime buffer descriptor configuration on the shim. 
Specifically, `single_core_alt.py` uses the `aiex.dma_configure_task_for`, `aiex.dma_start_task` and `aiex.dma_await_task` operations instead of `aiex.dma_memcpy_nd`. -## Notes on the `matmul_iron.py` Implementation +## Notes on the `single_core_iron.py` Implementation -There is an implementation of this design found in [`matmul_iron.py`](./matmul_iron.py) using a higher-level version of IRON. If you specify `use_iron=1` as an environment variable at compile time, this alternative implementation will be used in place of `matmul.py`. +There is an implementation of this design found in [`single_core_iron.py`](./single_core_iron.py) using a higher-level version of IRON. If you specify `use_iron=1` as an environment variable at compile time, this alternative implementation will be used in place of `single_core.py`. -Functionally, this design is intended to be identical to the other two. However, `matmul_iron.py` currently does not support tracing. +Functionally, this design is intended to be identical to the other two. However, `single_core_iron.py` currently does not support tracing. ## Building and Running the Design You need C++23 for bfloat16_t support. It can be found in g++-13: https://lindevs.com/install-g-on-ubuntu To compile design: -``` +```shell make -make matrixMultiplication.exe +make single_core.exe ``` To run the design: -``` +```shell make run ``` ## Tracing -To get tracing output, set `enable_tracing=True` in `matmul.py` and `ENABLE_TRACING=true` in `test.cpp`. +To get tracing output, set `enable_tracing=True` in `single_core.py` and `ENABLE_TRACING=true` in `test.cpp`. Tracing is also supported in `single_core_alt.py`. By default, traces will be written out to `trace.txt`; another output file can be specified using the `--trace` (or `-t`) flag to the host code. 
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index d4ad70e0eb..721a0cac94 100755 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -13,25 +13,17 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) include ${srcdir}/../../makefile-common device = npu -targetname = passThroughKernel +targetname = passthrough_kernel VPATH := ${srcdir}/../../../aie_kernels/generic data_size = 4096 trace_size = 8192 PASSTHROUGH_SIZE = ${data_size} -aie_py_src=aie2.py +aie_py_src=${targetname}.py use_alt?=0 -use_iron?=0 ifeq (${use_alt}, 1) -aie_py_src=aie2_alt.py -ifeq (${use_iron}, 1) -$(error Cannot specify both alternative design and IRON) -endif -endif - -ifeq (${use_iron}, 1) -aie_py_src=aie2_iron.py +aie_py_src=${targetname}_alt.py endif .PHONY: all template clean diff --git a/programming_examples/basic/passthrough_kernel/README.md b/programming_examples/basic/passthrough_kernel/README.md index 563c0d1185..0ba4bb3b88 100644 --- a/programming_examples/basic/passthrough_kernel/README.md +++ b/programming_examples/basic/passthrough_kernel/README.md @@ -14,7 +14,9 @@ This IRON design flow example, called "Passthrough Kernel", demonstrates a simpl ## Source Files Overview -1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). +1. `passthrough_kernel.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). + +1. 
`passthrough_kernel_alt.py`: A Python script that defines the AIE array structural design using an alternative IRON syntax that yields MLIR-AIE operations. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). 1. `passThrough.cc`: A C++ implementation of vectorized memcpy operations for AIE cores. Found [here](../../../aie_kernels/generic/passThrough.cc). @@ -28,15 +30,15 @@ This IRON design flow example, called "Passthrough Kernel", demonstrates a simpl This simple example effectively passes data through a single compute tile in the NPU's AIE array. The design is described as shown in the figure to the right. The overall design flow is as follows: 1. An object FIFO called "of_in" connects a Shim Tile to a Compute Tile, and another called "of_out" connects the Compute Tile back to the Shim Tile. -1. The runtime data movement is expressed to read `4096` uint8_t data from host memory to the compute tile and write the `4096` data back to host memory. +1. The runtime data movement is expressed to read `4096` `uint8_t` data from host memory to the compute tile and write the `4096` data back to host memory. 1. The compute tile acquires this input data in "object" sized (`1024`) blocks from "of_in" and copies them to another output "object" it has acquired from "of_out". Note that a vectorized kernel running on the Compute Tile's AIE core copies the data from the input "object" to the output "object". 1. After the vectorized copy is performed, the Compute Tile releases the "objects", allowing the DMAs (abstracted by the object FIFO) to transfer the data back to host memory and copy additional blocks into the Compute Tile, "of_out" and "of_in" respectively. -It is important to note that the Shim Tile and Compute Tile DMAs move data concurrently, and the Compute Tile's AIE Core also processes data concurrently with the data movement. 
This is made possible by expressing depth `2` in declaring, for example, `object_fifo("in", ShimTile, ComputeTile2, 2, line_ty)` to denote ping-pong buffers. +It is important to note that the Shim Tile and Compute Tile DMAs move data concurrently, and the Compute Tile's AIE Core also processes data concurrently with the data movement. This is made possible by expressing depth `2` in declaring the ObjectFifo, for example, `ObjectFifo(line_ty, name="in", default_depth=2)` to denote ping-pong buffers. If `default_depth` is not declared, the default is `2` in reference to this pattern. ## Design Component Details -### AIE Array Structural Design +### AIE Array Structural Alternative Design This design performs a memcpy operation on a vector of input data. The AIE design is described in a Python module as follows: @@ -66,34 +68,35 @@ This design performs a memcpy operation on a vector of input data. The AIE desig 1. **Vectorized Copying:** The `passThrough_aie()` function processes multiple data elements simultaneously, taking advantage of AIE vector datapath capabilities to load, copy and store data elements. -1. **C-style Wrapper Functions:** `passThroughLine()` and `passThroughTile()` are two C-style wrapper functions to call the templated `passThrough_aie()` vectorized memcpy implementation from the AIE design implemented in `aie2.py`. The `passThroughLine()` and `passThroughTile()` functions are compiled for `uint8_t`, `int16_t`, or `int32_t` determined by the value the `BIT_WIDTH` variable defines. +1. **C-style Wrapper Functions:** `passThroughLine()` and `passThroughTile()` are two C-style wrapper functions to call the templated `passThrough_aie()` vectorized memcpy implementation from the AIE design implemented in `passthrough_kernel.py`. The `passThroughLine()` and `passThroughTile()` functions are compiled for `uint8_t`, `int16_t`, or `int32_t` determined by the value the `BIT_WIDTH` variable defines. 
## Usage -### C++ Testbench +### Compilation To compile the design: -``` +```shell make ``` +To compile the alternative design: +```shell +env use_alt=1 make +``` + +### C++ Testbench + To complete compiling the C++ testbench and run the design: -``` +```shell make run ``` ### Python Testbench -To compile the design: - -``` -make -``` - To run the design: -``` +```shell make run_py ``` diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py deleted file mode 100755 index ff03ab0bd8..0000000000 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ /dev/null @@ -1,102 +0,0 @@ -# passthrough_kernel/aie2.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -import numpy as np -import sys - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.extras.context import mlir_mod_ctx -from aie.helpers.dialects.ext.scf import _for as range_ - -import aie.utils.trace as trace_utils - - -def passthroughKernel(dev, vector_size, trace_size): - N = vector_size - lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - - @device(dev) - def device_body(): - # define types - vector_ty = np.ndarray[(N,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] - - # AIE Core Function declarations - passThroughLine = external_func( - "passThroughLine", inputs=[line_ty, line_ty, np.int32] - ) - - # Tile declarations - ShimTile = tile(0, 0) - ComputeTile2 = tile(0, 2) - - # Set up a circuit-switched flow from core to shim for tracing information - if trace_size > 0: - flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) - - # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, 
ComputeTile2, 2, line_ty) - of_out = object_fifo("out", ComputeTile2, ShimTile, 2, line_ty) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "passThrough.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) - elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) - passThroughLine(elemIn, elemOut, lineWidthInBytes) - of_in.release(ObjectFifoPort.Consume, 1) - of_out.release(ObjectFifoPort.Produce, 1) - - @runtime_sequence(vector_ty, vector_ty, vector_ty) - def sequence(inTensor, outTensor, notUsed): - if trace_size > 0: - trace_utils.configure_simple_tracing_aie2( - ComputeTile2, - ShimTile, - ddr_id=1, - size=trace_size, - offset=N, - ) - - npu_dma_memcpy_nd( - metadata=of_in, - bd_id=0, - mem=inTensor, - sizes=[1, 1, 1, N], - issue_token=True, - ) - npu_dma_memcpy_nd( - metadata=of_out, - bd_id=1, - mem=outTensor, - sizes=[1, 1, 1, N], - ) - dma_wait(of_in, of_out) - - -try: - device_name = str(sys.argv[1]) - if device_name == "npu": - dev = AIEDevice.npu1_1col - elif device_name == "npu2": - dev = AIEDevice.npu2 - else: - raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - vector_size = int(sys.argv[2]) - if vector_size % 64 != 0 or vector_size < 512: - print("Vector size must be a multiple of 64 and greater than or equal to 512") - raise ValueError - trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) -except ValueError: - print("Argument has inappropriate value") -with mlir_mod_ctx() as ctx: - passthroughKernel(dev, vector_size, trace_size) - print(ctx.module) diff --git a/programming_examples/basic/passthrough_kernel/aie2_iron.py b/programming_examples/basic/passthrough_kernel/passthrough_kernel.py similarity index 97% rename from programming_examples/basic/passthrough_kernel/aie2_iron.py rename to programming_examples/basic/passthrough_kernel/passthrough_kernel.py index 654a73923f..36f1a92962 100644 --- 
a/programming_examples/basic/passthrough_kernel/aie2_iron.py +++ b/programming_examples/basic/passthrough_kernel/passthrough_kernel.py @@ -1,4 +1,4 @@ -# passthrough_kernel/aie2_iron.py -*- Python -*- +# passthrough_kernel/passthrough_kernel.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/passthrough_kernel_alt.py similarity index 98% rename from programming_examples/basic/passthrough_kernel/aie2_alt.py rename to programming_examples/basic/passthrough_kernel/passthrough_kernel_alt.py index 96f6171955..2f4260ea8a 100644 --- a/programming_examples/basic/passthrough_kernel/aie2_alt.py +++ b/programming_examples/basic/passthrough_kernel/passthrough_kernel_alt.py @@ -1,4 +1,4 @@ -# passthrough_kernel/aie2_alt.py -*- Python -*- +# passthrough_kernel/passthrough_kernel_alt.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/programming_examples/basic/passthrough_kernel/run_makefile_iron.lit b/programming_examples/basic/passthrough_kernel/run_makefile_iron.lit deleted file mode 100644 index 2467315910..0000000000 --- a/programming_examples/basic/passthrough_kernel/run_makefile_iron.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai, peano -// -// RUN: mkdir -p test_iron -// RUN: cd test_iron -// RUN: make -f %S/Makefile clean -// RUN: env use_iron=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run_py \ No newline at end of file diff --git a/programming_examples/basic/passthrough_pykernel/Makefile b/programming_examples/basic/passthrough_pykernel/Makefile index 04e292c8fa..a5101c5e1b 100644 --- a/programming_examples/basic/passthrough_pykernel/Makefile +++ b/programming_examples/basic/passthrough_pykernel/Makefile @@ -12,24 +12,16 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) include ${srcdir}/../../makefile-common -targetname = passThroughPyKernel +targetname = passthrough_pykernel VPATH := ${srcdir}/../../../aie_kernels/generic data_size = 4096 PASSTHROUGH_SIZE = ${data_size} -aie_py_src=aie2.py +aie_py_src=${targetname}.py use_alt?=0 -use_iron?=0 ifeq (${use_alt}, 1) -aie_py_src=aie2_alt.py -ifeq (${use_iron}, 1) -$(error Cannot specify both alternative design and IRON) -endif -endif - -ifeq (${use_iron}, 1) -aie_py_src=aie2_iron.py +aie_py_src=${targetname}_alt.py endif .PHONY: all template clean diff --git a/programming_examples/basic/passthrough_pykernel/README.md b/programming_examples/basic/passthrough_pykernel/README.md index 02e3e3ec24..dd9ef46662 100644 --- a/programming_examples/basic/passthrough_pykernel/README.md +++ b/programming_examples/basic/passthrough_pykernel/README.md @@ -10,17 +10,19 @@ # Passthrough Kernel: -This IRON design flow example, called "Passthrough Kernel", demonstrates a simple AIE implementation for a non-vectorized (scalar) memcpy on a vector of integers. In this design, a single AIE core performs the memcpy operation on a vector with a default length `4096`. The kernel, defined in Python code as a function, is configured to work on `1024` element-sized subvectors and is invoked multiple times to complete the full copy. 
The example consists of two primary design files: `aie2.py` and `passThrough.cc`, and a testbench `test.cpp` or `test.py`. +This IRON design flow example, called "Passthrough Kernel", demonstrates a simple AIE implementation for a non-vectorized (scalar) memcpy on a vector of integers. In this design, a single AIE core performs the memcpy operation on a vector with a default length `4096`. The kernel, defined in Python code as a function, is configured to work on `1024` element-sized subvectors and is invoked multiple times to complete the full copy. The example consists of two primary design files: `passthrough_pykernel.py` and `passThrough.cc`, and a testbench `test.cpp` or `test.py`. ## Source Files Overview -1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). +1. `passthrough_pykernel.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). + +1. `passthrough_pykernel_alt.py`: A Python script that defines an alternative AIE array structural design using MLIR-AIE operations defined with a lower-level version of IRON than that used in `passthrough_pykernel.py`. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). 1. `test.cpp`: This C++ code is a testbench for the Passthrough Kernel design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the script verifies the memcpy results and optionally outputs trace data. 1. 
`test.py`: This Python code is a testbench for the Passthrough Kernel design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the script verifies the memcpy results and optionally outputs trace data. -1. `passthrough_pykernel.ipynb`: This notebook contains the design (which is duplicated from `aie2.py`) and test code (which is duplicated from `test.py`) for an alternate way of interacting with the example. +1. `passthrough_pykernel.ipynb`: This notebook contains the design (which is duplicated from `passthrough_pykernel_alt.py`) and test code (which is duplicated from `test.py`) for an alternate way of interacting with the example. ## Design Overview @@ -32,11 +34,11 @@ This simple example effectively passes data through a single compute tile in the 1. The compute tile acquires this input data in "object" sized (`1024`) blocks from "of_in" and copies them to another output "object" it has acquired from "of_out". A scalar kernel defined via a Python fucntion is invoked on the Compute Tile's AIE core to copy the data from the input "object" to the output "object". 1. After the copy is performed, the Compute Tile releases the "objects", allowing the DMAs (abstracted by the object FIFO) to transfer the data back to host memory and copy additional blocks into the Compute Tile, "of_out" and "of_in" respectively. -It is important to note that the Shim Tile and Compute Tile DMAs move data concurrently, and the Compute Tile's AIE Core also processes data concurrently with the data movement. This is made possible by expressing depth `2` in declaring, for example, `object_fifo("in", ShimTile, ComputeTile2, 2, line_ty)` to denote ping-pong buffers. +It is important to note that the Shim Tile and Compute Tile DMAs move data concurrently, and the Compute Tile's AIE Core also processes data concurrently with the data movement. 
This is made possible by expressing `default_depth` is `2` when constructing the `ObjectFifo`, for example, `ObjectFifo(line_ty, default_depth=2)` to denote ping-pong buffers. By default, the depth is `2` in recognition of this common pattern. ## Design Component Details -### AIE Array Structural Design +### AIE Array Structural Alternative Design This design performs a memcpy operation on a vector of input data. The AIE design is described in a Python module as follows: @@ -62,31 +64,32 @@ This design performs a memcpy operation on a vector of input data. The AIE desig ## Usage -### C++ Testbench +### Compile the desing: To compile the design: -```bash +```shell make ``` +To compile the alternative design: +```shell +make env use_alt=1 +``` + +### C++ Testbench + To complete compiling the C++ testbench and run the design: -```bash +```shell make run ``` ### Python Testbench -To compile the design: - -```bash -make -``` - To run the design: -```bash +```shell make run_py ``` @@ -97,7 +100,7 @@ make run_py Make sure you use a terminal that has run the `utils/setup_env.sh` script so that the correct environment variables are percolated to jupyter. Below is an example of how to start a jupyter server: - ```bash + ```shell python3 -m jupyter notebook --no-browser --port=8080 ``` * In your browser, navigate to the URL (which includes a token) which is found @@ -107,7 +110,7 @@ make run_py * You should now be good to go! #### Run the Notebook as a Script -```bash +```shell make clean_notebook make run_notebook ``` \ No newline at end of file diff --git a/programming_examples/basic/passthrough_pykernel/aie2.py b/programming_examples/basic/passthrough_pykernel/aie2.py deleted file mode 100644 index 49c459391e..0000000000 --- a/programming_examples/basic/passthrough_pykernel/aie2.py +++ /dev/null @@ -1,84 +0,0 @@ -# passthrough_pykernel/aie2.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -import numpy as np -import sys - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.extras.context import mlir_mod_ctx -from aie.helpers.dialects.ext.func import func -from aie.helpers.dialects.ext.scf import _for as range_ - - -def passthroughKernel(vector_size): - N = vector_size - lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - - @device(AIEDevice.npu1_1col) - def device_body(): - # define types - line_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] - - # AIE Core Python Function declarations - @func(emit=True) - def passThroughLine(input: line_ty, output: line_ty, lineWidth: np.int32): - for i in range_(lineWidth): - output[i] = input[i] - - # Tile declarations - ShimTile = tile(0, 0) - ComputeTile2 = tile(0, 2) - - # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, ComputeTile2, 2, line_ty) - of_out = object_fifo("out", ComputeTile2, ShimTile, 2, line_ty) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2) - def core_body(): - for _ in range_(sys.maxsize): - elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) - elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) - passThroughLine(elemIn, elemOut, lineWidthInBytes) - of_in.release(ObjectFifoPort.Consume, 1) - of_out.release(ObjectFifoPort.Produce, 1) - - # print(ctx.module.operation.verify()) - - vector_ty = np.ndarray[(N,), np.dtype[np.uint8]] - - @runtime_sequence(vector_ty, vector_ty, vector_ty) - def sequence(inTensor, outTensor, notUsed): - npu_dma_memcpy_nd( - metadata=of_in, - bd_id=0, - mem=inTensor, - sizes=[1, 1, 1, N], - issue_token=True, - ) - npu_dma_memcpy_nd( - metadata=of_out, - bd_id=1, - mem=outTensor, - sizes=[1, 1, 1, N], - ) - dma_wait(of_in, of_out) - - -try: - vector_size = int(sys.argv[1]) - if vector_size % 
64 != 0 or vector_size < 512: - print("Vector size must be a multiple of 64 and greater than or equal to 512") - raise ValueError -except ValueError: - print("Argument has inappropriate value") -with mlir_mod_ctx() as ctx: - passthroughKernel(vector_size) - print(ctx.module) diff --git a/programming_examples/basic/passthrough_pykernel/aie2_iron.py b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel.py similarity index 97% rename from programming_examples/basic/passthrough_pykernel/aie2_iron.py rename to programming_examples/basic/passthrough_pykernel/passthrough_pykernel.py index 582fd5dbc2..794ba13dbd 100644 --- a/programming_examples/basic/passthrough_pykernel/aie2_iron.py +++ b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel.py @@ -1,4 +1,4 @@ -# passthrough_pykernel/aie2_iron.py -*- Python -*- +# passthrough_pykernel/passthrough_pykernel.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/programming_examples/basic/passthrough_pykernel/aie2_alt.py b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel_alt.py similarity index 97% rename from programming_examples/basic/passthrough_pykernel/aie2_alt.py rename to programming_examples/basic/passthrough_pykernel/passthrough_pykernel_alt.py index 007d986225..e003bb46df 100644 --- a/programming_examples/basic/passthrough_pykernel/aie2_alt.py +++ b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel_alt.py @@ -1,4 +1,4 @@ -# passthrough_pykernel/aie2_alt.py -*- Python -*- +# passthrough_pykernel/passthrough_pykernel_alt.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/programming_examples/basic/passthrough_pykernel/run_makefile_iron.lit b/programming_examples/basic/passthrough_pykernel/run_makefile_iron.lit deleted file mode 100644 index ba7ff133fb..0000000000 --- a/programming_examples/basic/passthrough_pykernel/run_makefile_iron.lit +++ /dev/null @@ -1,12 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai, peano -// -// RUN: mkdir -p test_iron -// RUN: cd test_iron -// RUN: make -f %S/Makefile clean -// RUN: env use_iron=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s -// CHECK: PASS! - \ No newline at end of file diff --git a/programming_examples/basic/row_wise_bias_add/Makefile b/programming_examples/basic/row_wise_bias_add/Makefile index 5a14a57905..a5bbbfc0cf 100644 --- a/programming_examples/basic/row_wise_bias_add/Makefile +++ b/programming_examples/basic/row_wise_bias_add/Makefile @@ -37,19 +37,11 @@ N=2304 m=96 n=32 -aie_py_src=aie2.py +aie_py_src=row_wise_bias_add.py use_alt?=0 -use_iron?=0 ifeq (${use_alt}, 1) -aie_py_src=aie2_alt.py -ifeq (${use_iron}, 1) -$(error Cannot specify both alternative design and IRON) -endif -endif - -ifeq (${use_iron}, 1) -aie_py_src=aie2_iron.py +aie_py_src=row_wise_bias_add_alt.py endif .PHONY: all diff --git a/programming_examples/basic/row_wise_bias_add/README.md b/programming_examples/basic/row_wise_bias_add/README.md index 0912f3f6d2..f03a106b64 100644 --- a/programming_examples/basic/row_wise_bias_add/README.md +++ b/programming_examples/basic/row_wise_bias_add/README.md @@ -17,11 +17,12 @@ Conceptually, `bias` is broadcast into a `M`×`N` matrix by repeating it `M ## Data Movement -The data movement and call into the kernel (see below) is described in `aie2.py`. +The data movement and call into the kernel (see below) is described in `row_wise_bias_add.py`. 
An alternative design that uses a lower-level +form of IRON is available in `row_wise_bias_add_alt.py`. A single AIE core is configured to process chunks of `m`×`n` of `in` and chunks of `n` of `bias` to produce `m`×`n` chunks of output. Therefore, the output is tiled into `M/m`×`N/n` tiles, and the kernel function is called that number of times. -To avoid unnecessarily reloading the `bias` vector, we iterate through these tiles in a column-major fashion. -The `strides` and `sizes` in the `aie.runtime_sequence` operation describe this column-major iteration. +To avoid unnecessarily reloading the `bias` vector, we iterate through these tiles in a column-major fashion by calling the `TensorTiler2D.group_tiler` +with argument `tile_group_col_major=True`. ## Kernel diff --git a/programming_examples/basic/row_wise_bias_add/aie2.py b/programming_examples/basic/row_wise_bias_add/aie2.py deleted file mode 100644 index 46e792ac48..0000000000 --- a/programming_examples/basic/row_wise_bias_add/aie2.py +++ /dev/null @@ -1,90 +0,0 @@ -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 AMD Inc. 
-import numpy as np -import sys - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.extras.context import mlir_mod_ctx -from aie.helpers.dialects.ext.scf import _for as range_ -from aie.helpers.taplib import TensorTiler2D - - -def row_wise_bias_add(M, N, m, n): - - assert M % m == 0 - assert N % n == 0 - - @device(AIEDevice.npu1_1col) - def device_body(): - - tensor_ty = np.ndarray[(m * n,), np.dtype[np.float32]] - bias_ty = np.ndarray[(n,), np.dtype[np.float32]] - - kernel_func = external_func( - f"row_wise_bias_add_f32_f32", inputs=[tensor_ty, bias_ty, tensor_ty] - ) - - shim_tile = tile(0, 0) - compute_tile = tile(0, 2) - - in_fifo = object_fifo("in_fifo", shim_tile, compute_tile, 2, tensor_ty) - bias_fifo = object_fifo("bias_fifo", shim_tile, compute_tile, 2, bias_ty) - out_fifo = object_fifo("out_fifo", compute_tile, shim_tile, 2, tensor_ty) - - @core(compute_tile, "kernel.o") - def core_body(): - for _ in range_(0xFFFFFFFF): - for _ in range_(N // n): - elem_bias = bias_fifo.acquire(ObjectFifoPort.Consume, 1) - for _ in range_(M // m): - elem_in = in_fifo.acquire(ObjectFifoPort.Consume, 1) - elem_out = out_fifo.acquire(ObjectFifoPort.Produce, 1) - kernel_func(elem_in, elem_bias, elem_out) - out_fifo.release(ObjectFifoPort.Produce, 1) - in_fifo.release(ObjectFifoPort.Consume, 1) - bias_fifo.release(ObjectFifoPort.Consume, 1) - - tiler = TensorTiler2D.group_tiler( - (M, N), (m, n), (M // m, N // n), tile_group_col_major=True - ) - bias_tiler = TensorTiler2D.group_tiler((1, N), (1, n), (1, N // n)) - - @runtime_sequence(tensor_ty, bias_ty, tensor_ty) - def sequence(inp, bias, out): - npu_dma_memcpy_nd( - metadata=in_fifo, - bd_id=0, - mem=inp, - tap=tiler[0], - ) - npu_dma_memcpy_nd( - metadata=bias_fifo, - bd_id=1, - mem=bias, - tap=bias_tiler[0], - ) - npu_dma_memcpy_nd( - metadata=out_fifo, - bd_id=2, - mem=out, - tap=tiler[0], - ) - # of_out will only complete after of_in completes, so we just wait on of_out instead of both - 
dma_wait(out_fifo) - - -# Declares that subsequent code is in mlir-aie context -with mlir_mod_ctx() as ctx: - row_wise_bias_add( - int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) - ) - res = ctx.module.operation.verify() - if res == True: - print(ctx.module) - else: - print(res) diff --git a/programming_examples/basic/row_wise_bias_add/aie2_iron.py b/programming_examples/basic/row_wise_bias_add/row_wise_bias_add.py similarity index 100% rename from programming_examples/basic/row_wise_bias_add/aie2_iron.py rename to programming_examples/basic/row_wise_bias_add/row_wise_bias_add.py diff --git a/programming_examples/basic/row_wise_bias_add/aie2_alt.py b/programming_examples/basic/row_wise_bias_add/row_wise_bias_add_alt.py similarity index 100% rename from programming_examples/basic/row_wise_bias_add/aie2_alt.py rename to programming_examples/basic/row_wise_bias_add/row_wise_bias_add_alt.py diff --git a/programming_examples/basic/row_wise_bias_add/run_makefile_iron.lit b/programming_examples/basic/row_wise_bias_add/run_makefile_iron.lit deleted file mode 100644 index fde085a5fe..0000000000 --- a/programming_examples/basic/row_wise_bias_add/run_makefile_iron.lit +++ /dev/null @@ -1,11 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai, peano -// -// RUN: mkdir -p test_iron -// RUN: cd test_iron -// RUN: make -f %S/Makefile clean -// RUN: env use_iron=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! 
diff --git a/programming_examples/basic/tiling_exploration/per_tile/Makefile b/programming_examples/basic/tiling_exploration/per_tile/Makefile index ec187f74df..d0071c1309 100644 --- a/programming_examples/basic/tiling_exploration/per_tile/Makefile +++ b/programming_examples/basic/tiling_exploration/per_tile/Makefile @@ -17,12 +17,13 @@ tensor_width = 8 tile_height = 2 tile_width = 2 data_str=${tensor_height}_${tensor_width}_${tile_height}_${tile_width} +aie_py_src=per_tile.py .PHONY: all template clean all: build/final_${data_str}.xclbin -build/aie_${data_str}.mlir: ${srcdir}/aie2_iron.py +build/aie_${data_str}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} python3 $< --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} > $@ @@ -35,7 +36,7 @@ build/final_${data_str}.xclbin: build/aie_${data_str}.mlir run: build/final_${data_str}.xclbin build/insts_${data_str}.txt ${powershell} python3 ${srcdir}/test.py -x build/final_${data_str}.xclbin -i build/insts_${data_str}.txt -k MLIR_AIE --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} -generate_access_map: ${srcdir}/aie2_iron.py +generate_access_map: ${srcdir}/${aie_py_src} mkdir -p ${@D} python3 $< --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} --generate-access-map ${M} ${K} diff --git a/programming_examples/basic/tiling_exploration/per_tile/README.md b/programming_examples/basic/tiling_exploration/per_tile/README.md index 9efe4848fd..319603a277 100644 --- a/programming_examples/basic/tiling_exploration/per_tile/README.md +++ b/programming_examples/basic/tiling_exploration/per_tile/README.md @@ -10,11 +10,11 @@ # Tiling Exploration -This IRON design flow example, called "Tiling Exploration: Per Tile", demonstrates how data may be `tiled` into smaller chunks and sent/received through the `runtime_sequence`. 
This is a common data transformation pattern, and this example is meant to be interactive. +This IRON design flow example, called "Tiling Exploration: Per Tile", demonstrates how data may be `tiled` into smaller chunks and sent/received through the `Runtime.sequence()` function. This is a common data transformation pattern, and this example is meant to be interactive. ## Source Files Overview -1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations and the `TensorTiler2D` to specify `TensorAccessPatterns` (*taps*) of data to be transferred out of the design. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). +1. `per_tile.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations and the `TensorTiler2D` to specify `TensorAccessPatterns` (*taps*) of data to be transferred out of the design. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). 1. `test.py`: This Python code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the script verifies the results against expected output. @@ -34,12 +34,12 @@ This design has no inputs; it produces a single output tensor. The single core u Modify tensor and tile dimensions in the `Makefile`. 
To compile and run the design for NPU: -```bash +```shell make clean make run ``` To generate a data visualization (like that above), run: -```bash +```shell make generate_access_map ``` diff --git a/programming_examples/basic/tiling_exploration/per_tile/aie2_iron.py b/programming_examples/basic/tiling_exploration/per_tile/per_tile.py similarity index 98% rename from programming_examples/basic/tiling_exploration/per_tile/aie2_iron.py rename to programming_examples/basic/tiling_exploration/per_tile/per_tile.py index 4936640d88..f88bb4f190 100644 --- a/programming_examples/basic/tiling_exploration/per_tile/aie2_iron.py +++ b/programming_examples/basic/tiling_exploration/per_tile/per_tile.py @@ -1,4 +1,4 @@ -# tiling_exploration/per_tile/aie2_iron.py-*- Python -*- +# tiling_exploration/per_tile/per_tile.py-*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/programming_examples/basic/tiling_exploration/tile_group/Makefile b/programming_examples/basic/tiling_exploration/tile_group/Makefile index ec187f74df..8a2f859907 100644 --- a/programming_examples/basic/tiling_exploration/tile_group/Makefile +++ b/programming_examples/basic/tiling_exploration/tile_group/Makefile @@ -17,12 +17,13 @@ tensor_width = 8 tile_height = 2 tile_width = 2 data_str=${tensor_height}_${tensor_width}_${tile_height}_${tile_width} +aie_py_src=tile_group.py .PHONY: all template clean all: build/final_${data_str}.xclbin -build/aie_${data_str}.mlir: ${srcdir}/aie2_iron.py +build/aie_${data_str}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} python3 $< --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} > $@ @@ -35,7 +36,7 @@ build/final_${data_str}.xclbin: build/aie_${data_str}.mlir run: build/final_${data_str}.xclbin build/insts_${data_str}.txt ${powershell} python3 ${srcdir}/test.py -x build/final_${data_str}.xclbin -i 
build/insts_${data_str}.txt -k MLIR_AIE --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} -generate_access_map: ${srcdir}/aie2_iron.py +generate_access_map: ${srcdir}/${aie_py_src} mkdir -p ${@D} python3 $< --tensor-height ${tensor_height} --tensor-width ${tensor_width} --tile-height ${tile_height} --tile-width ${tile_width} --generate-access-map ${M} ${K} diff --git a/programming_examples/basic/tiling_exploration/tile_group/README.md b/programming_examples/basic/tiling_exploration/tile_group/README.md index 111adc4e93..f6a326aacf 100644 --- a/programming_examples/basic/tiling_exploration/tile_group/README.md +++ b/programming_examples/basic/tiling_exploration/tile_group/README.md @@ -10,11 +10,11 @@ # Tiling Exploration -This IRON design flow example, called "Tiling Exploration: Tile Group", demonstrates how data may be `tiled` into smaller chunks and grouped into collections of tiles and sent/received through the `runtime_sequence`. This is a common data transformation pattern, and this example is meant to be interactive. +This IRON design flow example, called "Tiling Exploration: Tile Group", demonstrates how data may be `tiled` into smaller chunks and grouped into collections of tiles and sent/received through the `Runtime.sequence()` function. This is a common data transformation pattern, and this example is meant to be interactive. ## Source Files Overview -1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations and the `TensorTiler2D` to specify `TensorAccessPattern`s (*taps*) of data to be transferred out of the design. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). +1.
`tile_group.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations and the `TensorTiler2D` to specify `TensorAccessPattern`s (*taps*) of data to be transferred out of the design. The file generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.txt for the NPU in Ryzen™ AI). 1. `test.py`: This Python code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the script verifies the results against expected output. diff --git a/programming_examples/basic/tiling_exploration/tile_group/aie2_iron.py b/programming_examples/basic/tiling_exploration/tile_group/tile_group.py similarity index 98% rename from programming_examples/basic/tiling_exploration/tile_group/aie2_iron.py rename to programming_examples/basic/tiling_exploration/tile_group/tile_group.py index 436e88d6bd..733353feda 100644 --- a/programming_examples/basic/tiling_exploration/tile_group/aie2_iron.py +++ b/programming_examples/basic/tiling_exploration/tile_group/tile_group.py @@ -1,4 +1,4 @@ -# tiling_exploration/tile_group/aie2_iron.py -*- Python -*- +# tiling_exploration/tile_group/tile_group.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/programming_examples/basic/vector_exp/Makefile b/programming_examples/basic/vector_exp/Makefile index 9a9180d5e6..10232f10c7 100644 --- a/programming_examples/basic/vector_exp/Makefile +++ b/programming_examples/basic/vector_exp/Makefile @@ -12,21 +12,13 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) include ${srcdir}/../../makefile-common -targetname = testExp +targetname = vector_exp -aie_py_src=aie2.py +aie_py_src=${targetname}.py use_alt?=0 -use_iron?=0 ifeq (${use_alt}, 1) -aie_py_src=aie2_alt.py +aie_py_src=${targetname}_alt.py -ifeq (${use_iron}, 1) -$(error Cannot specify both alternative design and IRON) -endif -endif - -ifeq (${use_iron}, 1) -aie_py_src=aie2_iron.py endif all: build/final.xclbin build/insts.txt diff --git a/programming_examples/basic/vector_exp/README.md b/programming_examples/basic/vector_exp/README.md index 6c13f33578..c862ac7bfe 100644 --- a/programming_examples/basic/vector_exp/README.md +++ b/programming_examples/basic/vector_exp/README.md @@ -17,7 +17,9 @@ $e^x$ is typically used in machine learning applications with relatively small n ## Source Files Overview -1. `aie2.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and inst.txt for the NPU in Ryzen™ AI). +1. `vector_exp.py`: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and inst.txt for the NPU in Ryzen™ AI). + +1. `vector_exp_alt.py`: A functionally equivalent design to `vector_exp.py` that uses a lower-level IRON API. 1. `bf16_exp.cc`: A C++ implementation of vectorized table lookup operations for AIE cores. The lookup operation `getExpBf16` operates on vectors of size `16`, loading the vectorized accumulator registers with the look up table results.
It is then necessary to copy the accumulator register to a regular vector register before storing it back into memory. The source can be found [here](../../../aie_kernels/aie2/bf16_exp.cc). @@ -28,22 +30,27 @@ The design also uses a single file from the AIE runtime to initialize the look u ## Usage -### C++ Testbench +### Compilation To compile the design: - -``` +```shell make ``` +To compile the alternative design: +```shell +env use_alt=1 make +``` + +### C++ Testbench + To compile the C++ testbench: ``` -make testExp.exe +make vector_exp.exe ``` To run the design: - ``` make run ``` diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py deleted file mode 100644 index 8691928358..0000000000 --- a/programming_examples/basic/vector_exp/aie2.py +++ /dev/null @@ -1,125 +0,0 @@ -# vector_exp/aie2.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates -import numpy as np -from ml_dtypes import bfloat16 - -from aie.dialects.aie import * # primary mlir-aie dialect definitions -from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper - -from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.helpers.dialects.ext.scf import ( - _for as range_, -) # scf (structured control flow) dialect -from aie.helpers.util import np_ndarray_type_get_shape - - -# AI Engine structural design function -def my_eltwise_exp(): - - N = 65536 - - # Tile sizes - n = 1024 - N_div_n = N // n - - n_cores = 4 - tiles = N_div_n // n_cores - buffer_depth = 2 - - # Device declaration - aie2 device NPU (aka Ryzen AI) - @device(AIEDevice.npu1_1col) - def device_body(): - - tile_ty = np.ndarray[(n,), np.dtype[bfloat16]] - - # Type used in the tile memory - A_ty = np.ndarray[(n,), np.dtype[bfloat16]] - C_ty = np.ndarray[(n,), np.dtype[bfloat16]] - - # Type used in the memory tile which aggregates across the 4 cores - A_memTile_ty = np.ndarray[(n * n_cores,), np.dtype[bfloat16]] - C_memTile_ty = np.ndarray[(n * n_cores,), np.dtype[bfloat16]] - - # AIE Core Function declarations - - exp_bf16_1024 = external_func("exp_bf16_1024", inputs=[tile_ty, tile_ty]) - - # Tile declarations - ShimTile = tile(0, 0) - - MemTile = tile(0, 1) - cores = [tile(0, 2 + i) for i in range(n_cores)] - - inA_fifos = [] - outC_fifos = [] - - # AIE-array data movement with object fifos - # Input A - inA = object_fifo("inA", ShimTile, MemTile, buffer_depth, A_memTile_ty) - for i in range(n_cores): - inA_fifos.append( - object_fifo(f"memA{i}", MemTile, cores[i], buffer_depth, A_ty) - ) - if n_cores > 1: - of_offsets = [ - (np.prod(np_ndarray_type_get_shape(A_memTile_ty)) // n_cores) * i - for i in range(n_cores) - ] - else: - of_offsets = [] - object_fifo_link(inA, inA_fifos, [], of_offsets) - - # Output C - for i in range(n_cores): - outC_fifos.append( - object_fifo(f"memC{i}", cores[i], MemTile, buffer_depth, C_ty) - ) - outC = 
object_fifo("outC", MemTile, ShimTile, buffer_depth, C_memTile_ty) - if n_cores > 1: - of_offsets = [ - (np.prod(np_ndarray_type_get_shape(C_memTile_ty)) // n_cores) * i - for i in range(n_cores) - ] - else: - of_offsets = [] - object_fifo_link(outC_fifos, outC, of_offsets, []) - - # Compute tile bodies - for i in range(n_cores): - # Compute tile i - @core(cores[i], "kernels.a") - def core_body(): - for _ in range_(0xFFFFFFFF): - for _ in range_(tiles): - elem_out = outC_fifos[i].acquire(ObjectFifoPort.Produce, 1) - elem_in_a = inA_fifos[i].acquire(ObjectFifoPort.Consume, 1) - - exp_bf16_1024(elem_in_a, elem_out) - - inA_fifos[i].release(ObjectFifoPort.Consume, 1) - outC_fifos[i].release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - tensor_ty = np.ndarray[(N,), np.dtype[bfloat16]] - - @runtime_sequence(tensor_ty, tensor_ty) - def sequence(A, C): - npu_dma_memcpy_nd( - metadata=inA, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True - ) - npu_dma_memcpy_nd(metadata=outC, bd_id=0, mem=C, sizes=[1, 1, 1, N]) - dma_wait(inA, outC) - - -with mlir_mod_ctx() as ctx: - my_eltwise_exp() - res = ctx.module.operation.verify() - if res == True: - print(ctx.module) - else: - print(res) diff --git a/programming_examples/basic/vector_exp/aie2_iron.py b/programming_examples/basic/vector_exp/vector_exp.py similarity index 98% rename from programming_examples/basic/vector_exp/aie2_iron.py rename to programming_examples/basic/vector_exp/vector_exp.py index 90cfd0273c..f982bba271 100644 --- a/programming_examples/basic/vector_exp/aie2_iron.py +++ b/programming_examples/basic/vector_exp/vector_exp.py @@ -1,4 +1,4 @@ -# vector_exp/aie2_iron.py -*- Python -*- +# vector_exp/vector_exp.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
diff --git a/programming_examples/basic/vector_exp/aie2_alt.py b/programming_examples/basic/vector_exp/vector_exp_alt.py similarity index 98% rename from programming_examples/basic/vector_exp/aie2_alt.py rename to programming_examples/basic/vector_exp/vector_exp_alt.py index 8420cfa278..6ab6734935 100644 --- a/programming_examples/basic/vector_exp/aie2_alt.py +++ b/programming_examples/basic/vector_exp/vector_exp_alt.py @@ -1,4 +1,4 @@ -# vector_exp/aie2_alt.py -*- Python -*- +# vector_exp/vector_exp_alt.py -*- Python -*- # # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py deleted file mode 100644 index d2c8833e1d..0000000000 --- a/programming_examples/basic/vector_reduce_add/aie2.py +++ /dev/null @@ -1,77 +0,0 @@ -# vector_reduce_add/aie2.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates -import numpy as np -import sys - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.extras.context import mlir_mod_ctx -from aie.helpers.dialects.ext.scf import _for as range_ - - -def my_reduce_add(): - N = 1024 - - buffer_depth = 2 - - if len(sys.argv) != 3: - raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - - if sys.argv[1] == "npu": - dev = AIEDevice.npu1_1col - elif sys.argv[1] == "xcvc1902": - dev = AIEDevice.xcvc1902 - else: - raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - - @device(dev) - def device_body(): - in_ty = np.ndarray[(N,), np.dtype[np.int32]] - out_ty = np.ndarray[(1,), np.dtype[np.int32]] - - # AIE Core Function declarations - reduce_add_vector = external_func( - "reduce_add_vector", inputs=[in_ty, out_ty, np.int32] - ) - - # Tile declarations - ShimTile = tile(int(sys.argv[2]), 0) - ComputeTile2 = tile(int(sys.argv[2]), 2) - - # AIE-array data movement with object fifos - of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, in_ty) - of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, out_ty) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "reduce_add.cc.o") - def core_body(): - for _ in range_(0xFFFFFFFF): - elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) - reduce_add_vector(elem_in, elem_out, N) - of_in.release(ObjectFifoPort.Consume, 1) - of_out.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence(in_ty, out_ty) - def sequence(A, C): - npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N]) - npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, 1]) - # of_out will only complete after of_in completes, so we just wait on of_out instead of both - dma_wait(of_out) - - -with mlir_mod_ctx() as ctx: - my_reduce_add() - res = ctx.module.operation.verify() - if res == True: - 
print(ctx.module) - else: - print(res) diff --git a/programming_examples/basic/vector_reduce_add/run_makefile_iron.lit b/programming_examples/basic/vector_reduce_add/run_makefile_iron.lit deleted file mode 100644 index 33985759d2..0000000000 --- a/programming_examples/basic/vector_reduce_add/run_makefile_iron.lit +++ /dev/null @@ -1,12 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai, peano -// -// RUN: mkdir -p test_iron -// RUN: cd test_iron -// RUN: make -f %S/Makefile clean -// RUN: env use_iron=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! - \ No newline at end of file diff --git a/programming_examples/basic/vector_reduce_add/aie2_iron.py b/programming_examples/basic/vector_reduce_add/vector_reduce_add.py similarity index 100% rename from programming_examples/basic/vector_reduce_add/aie2_iron.py rename to programming_examples/basic/vector_reduce_add/vector_reduce_add.py diff --git a/programming_examples/basic/vector_reduce_add/aie2_alt.py b/programming_examples/basic/vector_reduce_add/vector_reduce_add_alt.py similarity index 100% rename from programming_examples/basic/vector_reduce_add/aie2_alt.py rename to programming_examples/basic/vector_reduce_add/vector_reduce_add_alt.py