Skip to content

Commit

Permalink
Fix trace for mm. Change default to vec and trace on. (#1896)
Browse files Browse the repository at this point in the history
Co-authored-by: Joseph Melber <[email protected]>
  • Loading branch information
jackl-xilinx and jgmelber authored Nov 3, 2024
1 parent d0d2277 commit 44095f1
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def main():
argparser.add_argument(
"--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i32"
)
argparser.add_argument("--trace_size", type=int, default=0)
args = argparser.parse_args()
with mlir_mod_ctx() as ctx:
my_matmul(
Expand All @@ -53,6 +54,7 @@ def main():
args.n_aie_cols,
args.dtype_in,
args.dtype_out,
args.trace_size,
)
# print(ctx.module.operation.verify())
print(ctx.module)
Expand All @@ -62,7 +64,7 @@ def ceildiv(a, b):
return (a + b - 1) // b


def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str):
def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, trace_size):

n_aie_rows = 4
n_aie_cores = n_aie_rows * n_aie_cols
Expand Down
30 changes: 23 additions & 7 deletions programming_examples/basic/matrix_multiplication/makefile-common
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ K?=512
N?=512
dtype_in?=i16
dtype_out?=i32
trace_size?=65536


ifeq ($(dtype_in),bf16)
dtype_in_cpp=std::bfloat16_t
Expand Down Expand Up @@ -70,11 +72,11 @@ ifeq ($(dtype_out),i8)
dtype_acc_cpp=int8_t
endif

trace_size?=65536

target_suffix?=${M}x${K}x${N}
mlir_target?=build/aie_${target_suffix}.mlir
trace_mlir_target?=build/aie_trace_${target_suffix}.mlir
xclbin_target?=build/final_${target_suffix}.xclbin
trace_xclbin_target?=build/trace_${target_suffix}.xclbin
insts_target?=build/insts_${target_suffix}.txt
aie_py_src?=aie2.py

Expand All @@ -94,7 +96,11 @@ build/%.o: ${kernels_dir}/%.cc

${mlir_target}: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${aieargs} > $@
python3 $< ${aieargs} --trace_size 0 > $@

${trace_mlir_target}: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${aieargs} --trace_size ${trace_size} > $@

${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o}
mkdir -p ${@D}
Expand All @@ -104,6 +110,14 @@ ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o}
) \
--aie-generate-npu --npu-insts-name=${insts_target:build/%=%} $(<:%=../%)

${trace_xclbin_target}: ${trace_mlir_target} ${kernels:%=build/%.o}
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
$(if $(shell [ $(CHESS) != true ] && echo true), \
--no-xchesscc --no-xbridge --peano ${PEANO_INSTALL_DIR}, \
) \
--aie-generate-npu --npu-insts-name=${insts_target:build/%=%} $(<:%=../%)

${targetname}.exe: ${srcdir}/test.cpp ${srcdir}/../test.cpp ${srcdir}/../common.h
rm -rf _build
mkdir -p _build
Expand All @@ -126,14 +140,16 @@ run: ${targetname}.exe ${xclbin_target}
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs}

trace: ${targetname}.exe ${xclbin_target} ${insts_target}
trace: ${targetname}.exe ${trace_xclbin_target} ${insts_target}
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json
${powershell} ./$< -x ${trace_xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs} -t ${trace_size}
../../../utils/parse_trace.py --filename trace.txt --mlir ${trace_mlir_target} --colshift 1 > trace_mm.json

# ${powershell} ./$< -x ${trace_xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}

.PHONY: parse_trace
parse_trace:
../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > trace_mm.json
../../../utils/parse_trace.py --filename trace.txt --mlir ${trace_mlir_target} --colshift 1 > trace_mm.json

.PHONY: clean
clean: clean_trace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,26 @@ def main():
choices=["bf16", "i8", "i16", "f32", "i32"],
default="i32",
)
argparser.add_argument("--trace_size", type=int, default=0)
args = argparser.parse_args()
my_matmul(
args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out
args.M,
args.K,
args.N,
args.m,
args.k,
args.n,
args.dtype_in,
args.dtype_out,
args.trace_size,
)


def ceildiv(a, b):
    """Return the ceiling of a / b using integer arithmetic only."""
    shifted_numerator = a + b - 1
    return shifted_numerator // b


def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, trace_size):

assert M % m == 0
assert K % k == 0
Expand All @@ -79,8 +88,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
assert n % t == 0

vectorized = True
enable_tracing = False
trace_size = 65536
enable_tracing = True if trace_size > 0 else False

dtype_in = dtype_map[dtype_in_str]
dtype_out = dtype_map[dtype_out_str]
Expand Down Expand Up @@ -109,7 +117,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):

with mlir_mod_ctx() as ctx:

C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize // 8
C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize

@device(AIEDevice.npu1_1col)
def device_body():
Expand Down Expand Up @@ -195,9 +203,10 @@ def device_body():
)
object_fifo_link(memC, outC)

# Set up a circuit-switched flow from core to shim for tracing information
if enable_tracing:
flow(compute_tile2, WireBundle.Trace, 0, shim_tile, WireBundle.DMA, 1)
# Set up a packet-switched flow from core to shim for tracing information
tiles_to_trace = [compute_tile2]
if trace_size > 0:
trace_utils.configure_packet_tracing_flow(tiles_to_trace, shim_tile)

# Set up compute tiles

Expand Down Expand Up @@ -230,34 +239,8 @@ def core_body():
def sequence(A, B, C):

if enable_tracing:
trace_utils.configure_simple_tracing_aie2(
compute_tile2,
shim_tile,
ddr_id=2,
size=trace_size,
offset=C_sz_in_bytes,
events=[
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_0,
port_number=1,
master=True,
),
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_1,
port_number=2,
master=True,
),
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_2,
port_number=5,
master=True,
),
trace_utils.CoreEvent.INSTR_EVENT_0,
trace_utils.CoreEvent.INSTR_EVENT_1,
trace_utils.CoreEvent.MEMORY_STALL,
trace_utils.CoreEvent.LOCK_STALL,
trace_utils.CoreEvent.INSTR_VECTOR,
],
trace_utils.configure_packet_tracing_aie2(
tiles_to_trace, shim_tile, trace_size, C_sz_in_bytes
)

# only do 4 tile rows at a time before synchronizing, so we can reuse BDs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,26 @@ def main():
choices=["bf16", "i8", "i16", "f32", "i32"],
default="i32",
)
argparser.add_argument("--trace_size", type=int, default=0)
args = argparser.parse_args()
my_matmul(
args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out
args.M,
args.K,
args.N,
args.m,
args.k,
args.n,
args.dtype_in,
args.dtype_out,
args.trace_size,
)


def ceildiv(a, b):
    """Integer ceiling division: smallest integer quotient >= a / b."""
    biased = a + b - 1
    return biased // b


def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, trace_size):

assert M % m == 0
assert K % k == 0
Expand All @@ -83,8 +92,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):
assert n % t == 0

vectorized = True
enable_tracing = False
trace_size = 65536
enable_tracing = True if trace_size > 0 else False

dtype_in = dtype_map[dtype_in_str]
dtype_out = dtype_map[dtype_out_str]
Expand Down Expand Up @@ -113,7 +121,7 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str):

with mlir_mod_ctx() as ctx:

C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize // 8
C_sz_in_bytes = C_sz * np.dtype(dtype_out).itemsize

@device(AIEDevice.npu1_1col)
def device_body():
Expand Down Expand Up @@ -199,9 +207,10 @@ def device_body():
)
object_fifo_link(memC, outC)

# Set up a circuit-switched flow from core to shim for tracing information
if enable_tracing:
flow(compute_tile2, WireBundle.Trace, 0, shim_tile, WireBundle.DMA, 1)
# Set up a packet-switched flow from core to shim for tracing information
tiles_to_trace = [compute_tile2]
if trace_size > 0:
trace_utils.configure_packet_tracing_flow(tiles_to_trace, shim_tile)

# Set up compute tiles

Expand Down Expand Up @@ -233,34 +242,8 @@ def core_body():
def sequence(A, B, C):

if enable_tracing:
trace_utils.configure_simple_tracing_aie2(
compute_tile2,
shim_tile,
ddr_id=2,
size=trace_size,
offset=C_sz_in_bytes,
events=[
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_0,
port_number=1,
master=True,
),
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_1,
port_number=2,
master=True,
),
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_2,
port_number=5,
master=True,
),
trace_utils.CoreEvent.INSTR_EVENT_0,
trace_utils.CoreEvent.INSTR_EVENT_1,
trace_utils.CoreEvent.MEMORY_STALL,
trace_utils.CoreEvent.LOCK_STALL,
trace_utils.CoreEvent.INSTR_VECTOR,
],
trace_utils.configure_packet_tracing_aie2(
tiles_to_trace, shim_tile, trace_size, C_sz_in_bytes
)

# These lists will hold handles to the DMA tasks we configure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def main():
choices=["bf16", "i8", "i16", "f32", "i32"],
default="i16",
)
argparser.add_argument("--trace_size", type=int, default=0)
args = argparser.parse_args()
with mlir_mod_ctx() as ctx:
my_matmul(
Expand All @@ -59,6 +60,7 @@ def main():
args.dtype_in,
args.dtype_out,
args.b_col_maj,
args.trace_size,
)
# print(ctx.module.operation.verify())
print(ctx.module)
Expand All @@ -68,7 +70,9 @@ def ceildiv(a, b):
return (a + b - 1) // b


def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, b_col_maj):
def my_matmul(
M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str, b_col_maj, trace_size
):

n_aie_rows = 4
n_aie_cores = n_aie_rows * n_aie_cols
Expand Down
4 changes: 2 additions & 2 deletions programming_guide/section-4/section-4c/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,9 @@ Looking at this table, we quickly see that the data movement is the bottleneck f

Mouse over the blocks of PortRunning0 and PortRunning1, what is the measured number of cycles per chunk? <img src="../../../mlir_tutorials/images/answer1.jpg" title="512 cycles" height=25> This matches what we expected to see. But note how it's obvious from the waveform how dominant data movement is as compared to compute.

1. We can already see that our design is inbalanced between data movement and compute where we have 72 cycles for compute and 512 cycles for data movement. Let's take a look at the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/) and see if we can do better. In the description, it talks about each iteration of the kernel is by default configured for MxKxN values of 64x64x64 giving us 262,144 MACs. Given that we're working with `int16_t` datatype which has 64 MACs per clock, how many cycles will the ideal case take? <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 262,144/ 64" height=25> Given that the A and B matrix are each 64x64 x `int16_t` and our stream switch channels are are 32-bits wide, how many cycles does it take to move data to the compute tile (bear in mind A and B can be moved in parallel via separate channels). <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 64x64/2" height=25>
1. We can already see that our design is imbalanced between data movement and compute where we have 72 cycles for compute and 512 cycles for data movement. Let's take a look at the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/single_core) and see if we can do better. In the description, it says that each iteration of the kernel is by default configured for MxKxN values of 64x64x64, giving us 262,144 MACs. Given that we're working with the `int16_t` datatype which has 64 MACs per clock, how many cycles will the ideal case take? <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 262,144/ 64" height=25> Given that the A and B matrix are each 64x64 x `int16_t` and our stream switch channels are 32-bits wide, how many cycles does it take to move data to the compute tile (bear in mind A and B can be moved in parallel via separate channels). <img src="../../../mlir_tutorials/images/answer1.jpg" title="2048 cycles = 64x64/2" height=25>

1. So this example should be perfectly balanced between compute and data movement! Navigate to the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/) and run the trace build (`make clean; make trace`). Then open the generated waveform json (`trace_mm.json`) and measure the delta between `event 0` and `event 1` in the first run. What value did you get and how close is it to ideal? <img src="../../../mlir_tutorials/images/answer1.jpg" title="~2535 cycles which is 80% of 2048" height=25> You should now see that both the compute cycles and the data movement cycles are much more closely matched!
1. So this example should be perfectly balanced between compute and data movement! Navigate to the [Matrix Multiply Example](../../../programming_examples/basic/matrix_multiplication/single_core) and run the trace build (`make clean; make -f Makefile.chess trace`). Then open the generated waveform json (`trace_mm.json`) and measure the delta between `event 0` and `event 1` in the first run. What value did you get and how close is it to ideal? <img src="../../../mlir_tutorials/images/answer1.jpg" title="~2535 cycles which is 80% of 2048" height=25> You should now see that both the compute cycles and the data movement cycles are much more closely matched!

## <u>Diving Deep - Examining the Microcode</u>
Let's take another look at the results of our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/). Let's also go back one step and comment out `chess_prepare_for_pipelining chess_loop_range(16, )` and rerun the compilation (`make clean; make trace`).
Expand Down

0 comments on commit 44095f1

Please sign in to comment.