Skip to content

Commit

Permalink
Use chess not Peano for trace programming examples and guide (#1886)
Browse files Browse the repository at this point in the history
Co-authored-by: Joseph Melber <[email protected]>
  • Loading branch information
jackl-xilinx and jgmelber authored Oct 29, 2024
1 parent b18dc8e commit 9fe5fb5
Show file tree
Hide file tree
Showing 11 changed files with 153 additions and 48 deletions.
27 changes: 21 additions & 6 deletions programming_examples/basic/vector_scalar_mul/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,35 +17,50 @@ VPATH := ${srcdir}/../../../aie_kernels/aie2
targetname = vectorScalar
data_size = 4096
trace_size = 8192
CHESS ?= true

all: build/final_${data_size}.xclbin build/insts_${data_size}.txt

kristof: build/insts_${data_size}.txt

# Compile one AIE kernel source (located through VPATH) into build/<name>.o.
# Exactly one compiler runs: CHESS=true selects xchesscc_wrapper, any other
# value selects the Peano clang++ toolchain.
build/%.o: %.cc
	mkdir -p ${@D}
ifeq ($(CHESS), true)
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F};
else
	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -c $< -o ${@F};
endif

build/aie_${data_size}.mlir: ${srcdir}/aie2.py
mkdir -p ${@D}
python3 $< ${data_size} 0 > $@

# Generate the trace-enabled MLIR design: run aie2.py with the data size and
# trace size and capture its stdout as the target. The prerequisite is
# addressed through ${srcdir} (like the non-trace rule) so that $< resolves
# correctly when make is invoked from another directory with -f.
build/aie_trace_${data_size}.mlir: ${srcdir}/aie2.py
	mkdir -p ${@D}
	python3 $< ${data_size} ${trace_size} > $@

#build/insts_${data_size}.txt: build/final_${data_size}.xclbin
# Build the non-trace xclbin with aiecc.py; the same invocation also writes
# insts_${data_size}.txt into the build directory.
# --no-xchesscc/--no-xbridge are passed only on the non-CHESS (Peano) path,
# so a CHESS=true build no longer receives the flags that disable chess.
build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
	mkdir -p ${@D}
ifeq ($(CHESS), true)
	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
else
	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		--no-xchesscc --no-xbridge \
		--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
endif

# Build the trace-enabled xclbin with aiecc.py from the trace MLIR design.
# Note: this writes insts_${data_size}.txt, the same instruction file name
# the non-trace build uses.
# --no-xchesscc/--no-xbridge are passed only on the non-CHESS (Peano) path,
# so a CHESS=true build no longer receives the flags that disable chess.
build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o
	mkdir -p ${@D}
ifeq ($(CHESS), true)
	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
else
	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		--no-xchesscc --no-xbridge \
		--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
endif

${targetname}_${data_size}.exe: ${srcdir}/test.cpp
rm -rf _build
Expand All @@ -66,11 +81,11 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt

# Run the C++ host executable against the trace xclbin, then convert the
# resulting trace.txt into trace_vs.json. parse_trace.py is invoked once,
# through ${srcdir}, so the target also works when make is run from a
# directory other than the one holding this Makefile.
trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
	${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size}
	${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json

# Run the Python host script (test.py) against the trace xclbin, then convert
# the resulting trace.txt into trace_vs.json. parse_trace.py is invoked once,
# through ${srcdir}, so the target also works when make is run from a
# directory other than the one holding this Makefile.
trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
	${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
	${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json


clean_trace:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@
//
// REQUIRES: ryzen_ai, peano
//
// RUN: mkdir -p test_peano
// RUN: cd test_peano
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile
// RUN: env CHESS=false make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace | FileCheck %s
// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
// CHECK: PASS!
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, chess
//
// RUN: mkdir -p test_chess
// RUN: cd test_chess
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=true make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace | FileCheck %s
// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
// CHECK: PASS!
19 changes: 14 additions & 5 deletions programming_guide/section-4/section-4b/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

include ${srcdir}/../../../programming_examples/makefile-common


all: build/final.xclbin

targetname = myFirstProgram

trace_size = 8192
CHESS ?= true

build/aie.mlir: ${srcdir}/aie2.py
mkdir -p ${@D}
Expand All @@ -26,18 +27,26 @@ build/aie_trace.mlir: ${srcdir}/aie2.py

# Compile the vector-scalar-multiply kernel into build/scale.o.
# Exactly one compiler runs: CHESS=true selects xchesscc_wrapper, any other
# value selects the Peano clang++ toolchain.
build/scale.o: ${srcdir}/vector_scalar_mul.cc
	mkdir -p ${@D}
ifeq ($(CHESS), true)
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F};
else
	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -c $< -o ${@F};
endif

# Build the non-trace xclbin with aiecc.py; the same invocation also writes
# insts.txt into the build directory.
# --no-xchesscc/--no-xbridge are added only when CHESS is not "true". The
# check uses $(filter) instead of forking a shell, so an empty CHESS is
# treated as "not chess", matching the ifeq test in the build/scale.o rule.
build/final.xclbin: build/aie.mlir build/scale.o
	mkdir -p ${@D}
	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		$(if $(filter true,$(CHESS)),,--no-xchesscc --no-xbridge) \
		--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)

# Build the trace-enabled xclbin with aiecc.py (verbose, -v) from the trace
# MLIR design; the same invocation also writes insts.txt.
# --no-xchesscc/--no-xbridge are added only when CHESS is not "true". The
# check uses $(filter) instead of forking a shell, so an empty CHESS is
# treated as "not chess", matching the ifeq test in the build/scale.o rule.
build/trace.xclbin: build/aie_trace.mlir build/scale.o
	mkdir -p ${@D}
	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		$(if $(filter true,$(CHESS)),,--no-xchesscc --no-xbridge) \
		--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)

${targetname}.exe: ${srcdir}/test.cpp
Expand Down
30 changes: 21 additions & 9 deletions programming_guide/section-4/section-4b/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,34 @@ Enabling trace support can be done with the following steps:
Enabling tracing means (1a) configuring the trace units for a given tile and then (1b) routing the generated event packets through the stream switches to the shim DMA, where we can write them to a buffer in DDR for post-runtime processing.

### <u>(1a) Configure trace units for an AIE tile</u>
The first necessary component for trace configuration is setting the right values for the trace control registers for each tile that we want to enable tracing for. In addition, the generated trace packets will need to be routed to shimDMA and then written to one of the 3 inout buffers. We have abstracted these two steps with the python wrapper function `configure_simple_tracing_aie2` which is in [python/utils/test.py](../../../python/utils/test.py) and is described in more detail in the [README](../../../python/utils) under `python/utils`. An example of how this function is used is shown below for quick reference:
The first necessary component for trace configuration is setting the right values for the trace control registers for each tile that we want to enable tracing for. In addition, the generated trace packets will need to be routed to shimDMA and then written to one of the 3 inout buffers. We have abstracted these two steps with the python wrapper function `configure_packet_tracing_aie2` which is in [python/utils/test.py](../../../python/utils/test.py) and is described in more detail in the [README](../../../python/utils) under `python/utils`. An example of how this function is used is shown below for quick reference:
```python
trace_utils.configure_simple_tracing_aie2(
ComputeTile2,
ShimTile,
ddr_id=2,
size=traceSizeInBytes,
offset=tensorSize,
)
trace_utils.configure_packet_tracing_aie2(tiles_to_trace, ShimTile, opts.trace_size, 4096*4)
```
The arguments for this example are
* *tiles_to_trace* - array of compute tiles we want to trace
* *ShimTile* - shim tile that the trace is going out to
* *opts.trace_size* - the trace buffer size in bytes
* *4096\*4* - the output buffer offset in bytes where the trace data begins

This block is defined within the sequence definition for `@runtime_sequence` where we define the shimDMA data movement to the 3 inout buffers.
> **Note** This simplification works very well for the trace buffer from a single tile to the shimDMA. However, if we want to do something more advanced, like allocating the trace buffer from multiple tiles into a single larger buffer, this function will not be able to express that. For that, please consult the [README](../../../python/utils) under `python/utils` for more guidance on how to customize the trace configuration.
> **Note** This simplified wrapper is an enhanced version of the simpler `configure_simple_tracing_aie2` used previously, which routed the trace from a single compute tile using circuit switched routing. This enhanced version relies on packet switched routing and supports tracing from multiple tiles by synchronizing the start event for each tile's trace unit to a user generated event. More guidance on how to customize the trace configuration can be found in the [README](../../../python/utils) under `python/utils`.
### <u>(1b) Define trace event routes from tile to shimDMA</u>
Once the trace units and shimDMA are configured, we need to define how the trace packets are routed from compute tile to shim tile. This is done via circuit switched flows or packet switched flows as described below. Note that trace units in the MemTile and ShimTile can also be configured and routed.

We can simplify defining the packet switched flows for the tiles we're tracing with the function `configure_packet_tracing_flow`, which is defined in [python/utils/test.py](../../../python/utils/test.py) and is described in more detail in the [README](../../../python/utils) under `python/utils`. An example of how this function is used is shown below for quick reference:
```python
trace_utils.configure_packet_tracing_flow(tiles_to_trace, ShimTile)
```
The arguments for this example are
* *tiles_to_trace* - array of compute tiles we want to trace
* *ShimTile* - shim tile that the trace is going out to

> **Note** The synchronization of this function with the previous one, `configure_packet_tracing_aie2`, is important because we track the route IDs and bd numbers of each configured trace. Do not mix and match these with circuit switched routing, as they are intended to work together as a packet tracing pair.
More details about the mechanics of circuit and packet switched flows are described below if you are interested. Otherwise, you can skip ahead to 2. Configure host code to read trace data and write it to a text file.

#### <u>Circuit switched flows</u>
An example of a simple circuit switch routing flow to route trace event packets from a compute tile to a shimDMA would be:

Expand Down
13 changes: 5 additions & 8 deletions programming_guide/section-4/section-4b/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,17 @@ def core_body():
of_out.release(ObjectFifoPort.Produce, 1)
of_factor.release(ObjectFifoPort.Consume, 1)

# Set up a circuit-switched flow from core to shim for tracing information
# Set up a packet-switched flow from core to shim for tracing information
tiles_to_trace = [ComputeTile2]
if enableTrace:
flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
trace_utils.configure_packet_tracing_flow(tiles_to_trace, ShimTile)

# To/from AIE-array data movement
@runtime_sequence(tensor_ty, scalar_ty, tensor_ty)
def sequence(A, F, C):
if enableTrace:
trace_utils.configure_simple_tracing_aie2(
ComputeTile2,
ShimTile,
ddr_id=2,
size=trace_size,
offset=4096 * 4, # offset in bytes
trace_utils.configure_packet_tracing_aie2(
tiles_to_trace, ShimTile, opts.trace_size, 4096 * 4
)

npu_dma_memcpy_nd(
Expand Down
25 changes: 14 additions & 11 deletions programming_guide/section-4/section-4b/run_makefile.lit
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
// RUN: make -f %S/Makefile clean
// RUN: %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
// CHECK: PASS!
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: mkdir -p test_peano
// RUN: cd test_peano
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=false make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
// make -f %S/Makefile clean
// env CHESS=false %run_on_npu make -f %S/Makefile trace | FileCheck %s
// env CHESS=false %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
// CHECK: PASS!
16 changes: 16 additions & 0 deletions programming_guide/section-4/section-4b/run_makefile_chess.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, chess
//
// RUN: mkdir -p test_chess
// RUN: cd test_chess
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=true make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
// RUN: %run_on_npu make -f %S/Makefile trace | FileCheck %s
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace | FileCheck %s
// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
// CHECK: PASS!
4 changes: 3 additions & 1 deletion programming_guide/section-4/section-4b/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ int main(int argc, const char *argv[]) {
constexpr int IN_SIZE = IN_VOLUME * sizeof(DATATYPE);
int OUT_SIZE = IN_SIZE + trace_size;

std::cout << "IN_SIZE: " << IN_SIZE << ", OUT_SIZE: " << OUT_SIZE;

// Load instruction sequence
std::vector<uint32_t> instr_v =
test_utils::load_instr_sequence(vm["instr"].as<std::string>());
Expand Down Expand Up @@ -163,7 +165,7 @@ int main(int argc, const char *argv[]) {
}

// Write trace values if trace_size > 0
if (trace_size > 0) {
if (trace_size > 0 and iter == 0) {
test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size,
vm["trace_file"].as<std::string>());
}
Expand Down
17 changes: 11 additions & 6 deletions programming_guide/section-4/section-4b/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def main(opts):
npu_time_min = 9999999
npu_time_max = 0
errors = 0
enable_trace = True if opts.trace_size > 0 else False

# ------------------------------------------------------
# Main run loop
Expand All @@ -98,25 +99,29 @@ def main(opts):
continue

# Copy output results and verify they are correct
entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
output_buffer = entire_buffer[:INOUT2_VOLUME]
full_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
output_buffer = full_buffer[:INOUT2_VOLUME]
if opts.verify:
if opts.verbosity >= 1:
print("Verifying results ...")
ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor
e = np.equal(output_buffer, ref)
errors = errors + np.size(e) - np.count_nonzero(e)

# Write trace values if trace_size > 0
if opts.trace_size > 0:
trace_buffer = entire_buffer[INOUT2_VOLUME:]
trace_utils.write_out_trace(trace_buffer, str(opts.trace_file))
# Keep the trace values from the first iteration if enable_trace is True
if enable_trace:
if i == 0:
trace_buffer = full_buffer[INOUT2_VOLUME:].view(np.uint32)

npu_time = stop - start
npu_time_total = npu_time_total + npu_time
npu_time_min = min(npu_time_min, npu_time)
npu_time_max = max(npu_time_max, npu_time)

# Write trace results
if enable_trace:
trace_utils.write_out_trace(trace_buffer, str(opts.trace_file))

# ------------------------------------------------------
# Print verification and timing results
# ------------------------------------------------------
Expand Down
28 changes: 27 additions & 1 deletion python/utils/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,32 @@ Test/ Host code utilities.
* The trace buffer should be stored as one uint32_t per indices/line
* `pack4bytes`
* Pack 4 bytes into a 32-bit word
* `configure_simple_tracing_aie2`
* `configure_packet_tracing_aie2` (packet switched multi-tile tracing)
* This function abstracts a number of functions for configuring multiple core tiles and an associated shim tile. It does not define trace packet routing between them, which `configure_packet_tracing_flow` does. These two functions should be used together.

Function arguments:
* `tiles to trace` - array of tiles to trace
* `shim tile` - Single shim tile to configure for writing trace packets to DDR
* `size` - trace buffer size (in bytes)
* `offset` - offset (in bytes) where trace buffer data should begin

An example use case would be:
```python
trace_utils.configure_packet_tracing_aie2(tiles_to_trace, ShimTile, opts.trace_size, 4096 * 4)
```
* `configure_packet_tracing_flow` (packet switched flows)
* This function automates the declaration of packet flows from an array of tiles to trace to the target shim tile. Note that this function makes assumptions about packet routing IDs, and these need to match the shim-side configuration, which is done with `configure_packet_tracing_aie2`.

Function arguments:
* `tiles to trace` - array of tiles to trace
* `shim tile` - Single shim tile to configure for writing trace packets to DDR

An example use case would be:
```python
trace_utils.configure_packet_tracing_flow(tiles_to_trace, ShimTile)
```

* `configure_simple_tracing_aie2` (circuit switched single tile tracing)
* This function abstracts a number of python functions for configuring a core tile and an associated shim tile. It does not define the trace packet routing between the two however.

Function arguments:
Expand Down Expand Up @@ -97,6 +122,7 @@ Test/ Host code utilities.
This one allows us to control the size, offset, and inout buffer mapping.

To better appreciate what this wrapper function does, we need to delve more deeply into the details on how trace units are configured.
* Additional helper functions can be found in `trace.py` and are documented directly in the source.

### Available Events for Tracing - `trace_events_enum.py`

Expand Down

0 comments on commit 9fe5fb5

Please sign in to comment.