From 15fe206f78dff2a8a10f5100a85e62fe550e9c4b Mon Sep 17 00:00:00 2001 From: Jack Lo <36210336+jackl-xilinx@users.noreply.github.com> Date: Wed, 17 Apr 2024 09:10:54 -0700 Subject: [PATCH] Moved test_utils.py to utils/test.py (#1270) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../basic/eltwise_exp/test.py | 3 +- .../basic/eltwise_mul/test.py | 3 +- programming_examples/ml/eltwise_add/test.py | 3 +- python/CMakeLists.txt | 9 + python/test_utils.py | 67 --- python/utils/ml.py | 440 ++++++++++++++++++ python/utils/test.py | 168 +++++++ python/utils/trace.py | 186 ++++++++ python/utils/xrt.py | 162 +++++++ 9 files changed, 968 insertions(+), 73 deletions(-) delete mode 100644 python/test_utils.py create mode 100644 python/utils/ml.py create mode 100644 python/utils/test.py create mode 100644 python/utils/trace.py create mode 100644 python/utils/xrt.py diff --git a/programming_examples/basic/eltwise_exp/test.py b/programming_examples/basic/eltwise_exp/test.py index cc132020d9..315b0c46c4 100644 --- a/programming_examples/basic/eltwise_exp/test.py +++ b/programming_examples/basic/eltwise_exp/test.py @@ -9,8 +9,7 @@ import sys import time -sys.path.append("../../programming_examples/utils") -import test_utils +import aie.utils.test as test_utils # ------------------------------------------------------ # Configure this to match your design's buffer size diff --git a/programming_examples/basic/eltwise_mul/test.py b/programming_examples/basic/eltwise_mul/test.py index 6bb9636093..1e6028a504 100644 --- a/programming_examples/basic/eltwise_mul/test.py +++ b/programming_examples/basic/eltwise_mul/test.py @@ -11,8 +11,7 @@ import sys import time -sys.path.append("../../python") -import test_utils +import aie.utils.test as test_utils # ------------------------------------------------------ # Configure this to match your design's buffer size diff --git a/programming_examples/ml/eltwise_add/test.py 
b/programming_examples/ml/eltwise_add/test.py index b53b00db20..ad433460f1 100644 --- a/programming_examples/ml/eltwise_add/test.py +++ b/programming_examples/ml/eltwise_add/test.py @@ -12,8 +12,7 @@ import sys import time -sys.path.append("../../programming_examples/utils") -import test_utils +import aie.utils.test as test_utils # ------------------------------------------------------ # Configure this to match your design's buffer size diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index af11f43567..59d89d38a5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,6 +26,15 @@ declare_mlir_python_sources(AIEPythonSources.Util util.py ) +declare_mlir_python_sources(AIEPythonSources.Utils + ADD_TO_PARENT AIEPythonSources + SOURCES + utils/test.py + utils/xrt.py + utils/ml.py + utils/trace.py +) + declare_mlir_python_sources(AIEPythonSources.Extras ADD_TO_PARENT AIEPythonSources SOURCES_GLOB diff --git a/python/test_utils.py b/python/test_utils.py deleted file mode 100644 index 5e187f6330..0000000000 --- a/python/test_utils.py +++ /dev/null @@ -1,67 +0,0 @@ -# test_utils.py -*- Python -*- -# -# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. 
-# SPDX-License-Identifier: MIT - -import argparse - - -# options -def parse_args(args): - p = argparse.ArgumentParser() - p.add_argument( - "-x", "--xclbin", required=True, dest="xclbin", help="the input xclbin path" - ) - p.add_argument( - "-k", - "--kernel", - required=True, - dest="kernel", - default="MLIR_AIE", - help="the kernel name in the XCLBIN (for instance MLIR_AIE)", - ) - p.add_argument( - "-v", "--verbosity", default=0, type=int, help="the verbosity of the output" - ) - p.add_argument( - "-i", - "--instr", - dest="instr", - default="instr.txt", - help="path of file containing userspace instructions sent to the NPU", - ) - p.add_argument( - "--verify", - dest="verify", - default=True, - help="whether to verify the AIE computed output", - ) - p.add_argument( - "--iters", - dest="iters", - default=1, - type=int, - help="number of benchmark iterations", - ) - p.add_argument( - "--warmup", - dest="warmup_iters", - default=0, - type=int, - help="number of warmup iterations", - ) - p.add_argument( - "-t", - "--trace_sz", - dest="trace_size", - default=0, - type=int, - help="trace size in bytes", - ) - p.add_argument( - "--trace_file", - dest="trace_file", - default="trace.txt", - help="where to store trace output", - ) - return p.parse_args(args) diff --git a/python/utils/ml.py b/python/utils/ml.py new file mode 100644 index 0000000000..527ced00e9 --- /dev/null +++ b/python/utils/ml.py @@ -0,0 +1,440 @@ +# ml.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
+ +import os +from torch.utils.data import Dataset + +# from PIL import Image +import json +import sys +import csv +import json +import argparse +import numpy as np + +# import cv2 +import numpy as np + +# from PIL import Image +from pathlib import Path +import torch +import torch.nn as nn + +# from prettytable import PrettyTable +import math + + +# class ImageNetKaggle(Dataset): +# def __init__(self, root, split, transform=None): +# self.samples = [] +# self.targets = [] +# self.transform = transform +# self.syn_to_class = {} +# with open(os.path.join(root, "imagenet_class_index.json"), "rb") as f: +# json_file = json.load(f) +# for class_id, v in json_file.items(): +# self.syn_to_class[v[0]] = int(class_id) +# with open(os.path.join(root, "ILSVRC2012_val_labels.json"), "rb") as f: +# self.val_to_syn = json.load(f) +# samples_dir = os.path.join(root, "ILSVRC/Data/CLS-LOC", split) +# for entry in os.listdir(samples_dir): +# if split == "train": +# syn_id = entry +# target = self.syn_to_class[syn_id] +# syn_folder = os.path.join(samples_dir, syn_id) +# for sample in os.listdir(syn_folder): +# sample_path = os.path.join(syn_folder, sample) +# self.samples.append(sample_path) +# self.targets.append(target) +# elif split == "val": +# syn_id = self.val_to_syn[entry] +# target = self.syn_to_class[syn_id] +# sample_path = os.path.join(samples_dir, entry) +# self.samples.append(sample_path) +# self.targets.append(target) + +# def __len__(self): +# return len(self.samples) + +# def __getitem__(self, idx): +# x = Image.open(self.samples[idx]).convert("RGB") +# if self.transform: +# x = self.transform(x) +# return x, self.targets[idx] + + +class CSVLogger: + def __init__(self, filename, sep=","): + self.filename = str(filename) + if os.path.exists(self.filename): + with open(self.filename) as f: + self.columns = csv.DictReader(f).fieldnames + else: + self.columns = None + self.fh = open(self.filename, "a", newline="") + self.csvwriter = csv.writer(self.fh, delimiter=sep) + 
self.count = 0 + + def set_columns(self, columns): + if self.columns: + raise Exception("Columns already set") + self.columns = list(columns) + self.csvwriter.writerow(self.columns) + + def append(self, row): + if self.columns is None: + self.set_columns(row.keys()) + self.csvwriter.writerow([row.get(k, "-") for k in self.columns]) + self.count += 1 + if self.count > 100: + self.count = 0 + self.fh.flush() + + def close(self): + self.fh.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + +def load_class_label(class_label_file: str, num_classes: int) -> list: + class_label = json.load(open(class_label_file)) + class_label_list = [class_label[str(i)] for i in range(num_classes)] + + return class_label_list + + +# def count_parameters(model): +# table = PrettyTable(["Modules", "Parameters"]) +# total_params = 0 +# for name, parameter in model.named_parameters(): +# if not parameter.requires_grad: +# continue +# param = parameter.numel() +# table.add_row([name, param]) +# total_params += param +# print(table) +# print(f"Total Trainable Params: {total_params}") +# return total_params + + +def unpickle(file): + import pickle + + with open(file, "rb") as fo: + dict = pickle.load(fo, encoding="latin1") + return dict + + +# def extract_cifar(): +# datafile = r"./data_torchvision/cifar-10-batches-py/test_batch" +# metafile = r"./data_torchvision/cifar-10-batches-py/batches.meta" + +# data_batch_1 = unpickle(datafile) +# metadata = unpickle(metafile) + +# images = data_batch_1["data"] +# labels = data_batch_1["labels"] +# images = np.reshape(images, (10000, 3, 32, 32)) + +# import os + +# dirname = "cifar_images" +# if not os.path.exists(dirname): +# os.mkdir(dirname) + +# # Extract and dump first 10 images +# for i in range(0, 100): +# im = images[i] +# im = im.transpose(1, 2, 0) +# im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) +# im_name = f"./cifar_images/image_{i}.png" +# cv2.imwrite(im_name, im) + + +def 
fuse_single_conv_bn_pair(bn_mean, bn_var, bn_wts, bn_bias, conv_wts): + # https://github.com/ChoiDM/Pytorch_BN_Fold/blob/master/bn_fold.py + eps = 1e-05 + mu = bn_mean + var = bn_var + gamma = bn_wts + # if 'bias' in bn_bias: + # beta = bn_bias + # else: + # beta = torch.zeros(gamma.size(0)).float() + beta = bn_bias + # Conv params + W = conv_wts + + denom = torch.sqrt(var + eps) + + A = gamma.div(denom) + # bias = torch.zeros(W.size(0)).float() + # b = beta - gamma.mul(mu).div(denom) + # bias *= A + A = A.expand_as(W.transpose(0, -1)).transpose(0, -1) + A = A.to(torch.int8) + W.mul_(A) + # bias.add_(b) + + return W + + +class DataShaper: + def __init__(self, defOrder="RC", print_info=False): + self.defOrder = defOrder + self.print_info = print_info + self.log_msg = [] + + def _reorder_granularity_range( + self, order, z, start_data_dim=-1, stop_str_dim=None + ): + if stop_str_dim is None: + stop_str_dim = len(order) + gran = 1 + pre_group = {} + for idx, s in enumerate(order.split(z)[1:]): + step = "" + pg = "" + off = 0 + if s.find(")") >= 0 and (s.find("(") > s.find(")") or s.find("(") < 0): + pg, s = s.split(")", 1) + for i, c in enumerate(s): + if c.isdigit(): + step += c + else: + if c == ">": + off = int(s[i + 1]) + if c == "<": + off = -int(s[i + 1]) + break + if len(pg) > 0 and pg[0] in "<>": + if pg[0] == ">": + off = int(pg[1]) + if pg[0] == "<": + off = -int(pg[1]) + pg = pg[2:] + if step and idx + off > start_data_dim and idx <= stop_str_dim: + gran *= int(step) + for p in pg: + if p.isdigit() or p in "<>": + continue + elif p in pre_group: + pre_group[p] *= int(step) + else: + pre_group[p] = int(step) + return gran, pre_group + + def _reorder_decode(self, shape, order, defOrder=None): + if not defOrder: + defOrder = self.defOrder + Ds = [order.count(c) for c in defOrder] + D = list(shape) + size = [0] * sum(Ds) + perm = [0] * sum(Ds) + pad_im = [0] * len(shape) + pad_ex = [0] * sum(Ds) + brdcst = [1] * sum(Ds) + align = [1] * sum(Ds) + val = "" + 
val_gi = "" + off = 0 + group = False + d = [sum(Ds[0 : i + 1]) - 1 for i in range(len(Ds))] + p = sum(Ds) - 1 + for z in reversed(order): + if z.isdigit(): + if group: + val_gi = z + val_gi + else: + val = z + val + elif z == ">": + if group: + off = int(val_gi) + val_gi = "" + else: + off = int(val) + val = "" + elif z == "<": + if group: + off = -int(val_gi) + val_gi = "" + else: + off = -int(val) + val = "" + elif z == ")": + group = True + elif z == "(": + group = False + off = 0 + val = "" + val_gi = "" + elif z == "%": # Pad dimension by N + pad_ex[p + 1] += max(0, int(val) - 1) * ( + size[perm[p + 1]] + pad_ex[p + 1] + ) + val = "" + elif z == "*": # Broadcast dimension by N + brdcst[p] *= int(val) + val = "" + elif z == "|": # Align data after a dimension to N + align[p] *= int(val) + val = "" + elif z in defOrder: + idx = defOrder.find(z) + perm[p] = d[idx] + off + if off < 0: + start_dim = d[idx] + off - sum(Ds[0:idx]) if val else -1 + stop_dim = d[idx] - sum(Ds[0:idx]) + gran, pre_group = self._reorder_granularity_range( + order, z, start_dim, stop_dim + ) + for i, c in enumerate(defOrder): + if c in pre_group: + if D[i] >= pre_group[c]: + gran //= pre_group[c] + elif D[i] > 1: + gran = int(math.ceil(1.0 * gran / D[i])) + D_rem = max(1, D[idx] // gran) + else: + D_rem = D[idx] + if val: + vi = int(val) + if group: + if vi > D_rem: + vi_rem = vi // D_rem + vi //= vi_rem + val = str(vi_rem) + else: + val = "1" + else: + val = "" + else: + vi = D_rem + if vi > 0: + if D[idx] % vi != 0: + dim_sub = np.prod( + np.maximum(1, size[sum(Ds[0:idx]) : sum(Ds[0 : idx + 1])]) + ) + pad_im[idx] += (vi - D[idx] % vi) * dim_sub + size[d[idx] + off] = vi + D[idx] = int(math.ceil(1.0 * D[idx] / vi)) + if not group: + off = 0 + d[idx] -= 1 + p -= 1 + if self.print_info: + self.log_msg.append( + "[INFO]: reorder s={:<15} o={:<15} -> pi={:<15} s={:<30} p={:<30} pe={:<30}, b={:<30}, a={:<30}".format( + *map(str, (shape, order, pad_im, size, perm, pad_ex, brdcst, align)) + ) 
+ ) + return pad_im, size, perm, pad_ex, brdcst, align + + def reorder_mat(self, mat, order, defOrder=None, inverse=False): + pad_im, size, perm, pad_ex, brdcst, align = self._reorder_decode( + mat.shape, order, defOrder + ) + if not inverse: + if sum(pad_im) > 0: + mat = np.pad(mat, tuple(zip([0] * len(pad_im), pad_im)), "constant") + mat = mat.reshape(*size).transpose(perm) + if sum(pad_ex) > 0: + mat = np.pad(mat, tuple(zip([0] * len(pad_ex), pad_ex)), "constant") + if np.prod(brdcst) > 1: + for idx, b in enumerate(brdcst): + if b > 1: + mat = np.repeat(mat, b, axis=idx) + if np.prod(align) > 1: + for idx, a in reversed(tuple(enumerate(align))): + if a > 1: + mat = mat.reshape(mat.shape[: idx + 1] + (-1,)) + pad = a - (mat.shape[-1] % a) + if pad < a: + mp = np.zeros((len(mat.shape), 2), dtype=np.int) + mp[-1, -1] = pad + mat = np.pad(mat, mp, "constant") + else: + assert sum(pad_im) == 0, "Reverse of implicit padding not supported" + assert sum(pad_ex) == 0, "Reverse of explicit padding not supported" + assert np.prod(brdcst) == 1, "Reverse of broadcasting not supported" + assert np.prod(align) == 1, "Reverse of alignment not supported" + perm_inv = [perm.index(p) for p in range(len(perm))] + size_inv = [size[p] for p in perm] + mat = mat.reshape(*size_inv) + mat = mat.transpose(perm_inv) + + return mat.reshape(-1) + + def get_dim_steps( + self, shape, order, defOrder=None, bits=8, ebs=None, sparse_ratio=1 + ): + pad_im, size, perm, pad_ex, brdcst, align = self._reorder_decode( + shape, order, defOrder + ) + sz = 1 + d = len(shape) - 1 + sp = len(perm) - 1 + dim = [0] * len(shape) + for i, s in enumerate(reversed(size)): + sz *= s + if sz >= shape[d] + pad_im[d]: + # current dimension contains all elements + p = len(perm) - 1 - i + dim[d] = pi0 = perm.index(p) + if p + 1 < sp and p + 1 in perm: + pi1 = perm.index(p + 1) + if pi0 + 1 == pi1: + # Found XX coupling + dim[d] = pi1 + self.log_msg.append( + "INFO: Found XX coupling (order={}, size={}, perm={}, 
p={})".format( + order, size, perm, p + ) + ) + elif len(perm) > pi0 + 1: + pb = perm[pi0 + 1] + if pi0 + 2 == pi1 and size[pb] == 1: + # Found XNX sequence with N=1, simplify + dim[d] = pi1 + self.log_msg.append( + "INFO: Found XNX sequency with N=1, simplify (order={}, size={}, perm={}, p={})".format( + order, size, perm, p + ) + ) + sz = 1 + d -= 1 + sp = p + # dim = [perm.index(p) for p in dim] + size_inv = (np.array(size)[perm] + pad_ex) * brdcst + idx = -2 if bits == 4 and size_inv[-1] == 2 else -1 + if ebs or sparse_ratio: + assert ( + size_inv[idx] >= 8 + ), "Sparse/exponent block is too small. Data (order) unexpected or update to script is required" + size_inv[idx] = int( + size_inv[idx] * sparse_ratio * (bits - (8 if ebs else 0)) / 8 + ) + (size_inv[idx] // ebs if ebs else 0) + step = [0] * (len(shape) + 1) + cur = 1 + for i_rev, (s, al) in enumerate(reversed(tuple(zip(size_inv, align)))): + i = len(perm) - 1 - i_rev + if al > 1: + cur = ((cur + al - 1) // al) * al + if i in dim: + step[dim.index(i)] = cur + cur *= s + step[-1] = cur + return step + + +if __name__ == "__main__": + extract_cifar() diff --git a/python/utils/test.py b/python/utils/test.py new file mode 100644 index 0000000000..81c3e51e59 --- /dev/null +++ b/python/utils/test.py @@ -0,0 +1,168 @@ +# test.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
+ +import argparse +import pyxrt as xrt + + +# options +def parse_args(args): + p = argparse.ArgumentParser() + p.add_argument( + "-x", "--xclbin", required=True, dest="xclbin", help="the input xclbin path" + ) + p.add_argument( + "-k", + "--kernel", + required=True, + dest="kernel", + default="MLIR_AIE", + help="the kernel name in the XCLBIN (for instance MLIR_AIE)", + ) + p.add_argument( + "-v", "--verbosity", default=0, type=int, help="the verbosity of the output" + ) + p.add_argument( + "-i", + "--instr", + dest="instr", + default="instr.txt", + help="path of file containing userspace instructions sent to the NPU", + ) + p.add_argument( + "--verify", + dest="verify", + default=True, + help="whether to verify the AIE computed output", + ) + p.add_argument( + "--iters", + dest="iters", + default=1, + type=int, + help="number of benchmark iterations", + ) + p.add_argument( + "--warmup", + dest="warmup_iters", + default=0, + type=int, + help="number of warmup iterations", + ) + p.add_argument( + "-t", + "--trace_sz", + dest="trace_size", + default=0, + type=int, + help="trace size in bytes", + ) + p.add_argument( + "--trace_file", + dest="trace_file", + default="trace.txt", + help="where to store trace output", + ) + return p.parse_args(args) + + +# Add default args to standard parser object +def create_default_argparser(): + p = argparse.ArgumentParser() + p.add_argument( + "-x", "--xclbin", required=True, dest="xclbin", help="the input xclbin path" + ) + p.add_argument( + "-k", + "--kernel", + required=True, + dest="kernel", + default="MLIR_AIE", + help="the kernel name in the XCLBIN (for instance MLIR_AIE)", + ) + p.add_argument( + "-v", "--verbosity", default=0, type=int, help="the verbosity of the output" + ) + p.add_argument( + "-i", + "--instr", + dest="instr", + default="instr.txt", + help="path of file containing userspace instructions sent to the NPU", + ) + p.add_argument( + "--verify", + dest="verify", + default=True, + help="whether to verify the AIE 
computed output", + ) + p.add_argument( + "--iters", + dest="iters", + default=1, + type=int, + help="number of benchmark iterations", + ) + p.add_argument( + "--warmup", + dest="warmup_iters", + default=0, + type=int, + help="number of warmup iterations", + ) + p.add_argument( + "-t", + "--trace_sz", + dest="trace_size", + default=0, + type=int, + help="trace size in bytes", + ) + p.add_argument( + "--trace_file", + dest="trace_file", + default="trace.txt", + help="where to store trace output", + ) + return p + + +def write_out_trace(trace_buffer, trace_size, trace_file): + try: + with open(trace_file, "wt") as f: + f.write(trace_buffer) + except Exception as e: + print(e) + sys.exit(1) + + +def init_xrt_load_kernel(opts): + # Get a device handle + device = xrt.device(0) + + # Load the xclbin + xclbin = xrt.xclbin(opts.xclbin) + + # Load the kernel + kernels = xclbin.get_kernels() + try: + xkernel = [k for k in kernels if opts.kernel in k.get_name()][0] + except: + print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'") + exit(-1) + + # Register xclbin + device.register_xclbin(xclbin) + + # Get a hardware context + context = xrt.hw_context(device, xclbin.get_uuid()) + + # get a kernel handle + kernel = xrt.kernel(context, xkernel.get_name()) + + return (device, kernel) diff --git a/python/utils/trace.py b/python/utils/trace.py new file mode 100644 index 0000000000..8c3e97be87 --- /dev/null +++ b/python/utils/trace.py @@ -0,0 +1,186 @@ +# trace.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
+ +from aie.dialects.aiex import * + + +def extract_trace(out_buf, out_buf_shape, out_buf_dtype, trace_size): + trace_size_words = trace_size // 4 + out_buf_flat = out_buf.reshape((-1,)).view(np.uint32) + output_prefix = ( + out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape) + ) + trace_suffix = out_buf_flat[-trace_size_words:] + return output_prefix, trace_suffix + + +def write_out_trace(trace, file_name): + out_str = "\n".join(f"{i:0{8}x}" for i in trace if i != 0) + with open(file_name, "w") as f: + f.write(out_str) + + +# trace_utils.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. + +from aie.dialects.aiex import * + + +def pack4bytes(b3, b2, b1, b0): + w = (b3 & 0xFF) << 24 + w |= (b2 & 0xFF) << 16 + w |= (b1 & 0xFF) << 8 + w |= (b0 & 0xFF) << 0 + return w + + +# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md +# This is a very simple model of tracing, which has some big assumptions: +# 1) Trace data is collected over circuit switched connections, not packet-switched +# 2) A ShimDMA S2MM channel is dedicated to the trace data +# 3) Trace data is small enough to fit in a fixed-size buffer, which is collected with the +# outputs of the design +# 4) The usual model of '2 inputs, 1 output' is followed, and the +# trace data is appended to the other outputs + +# tile: The tile we're tracing +# shim: The shim tile to output data with. +# bd_id: The BD in the shim tile to use. +# channel: The S2MM channel to use (0 or 1). +# size: The size of the trace data +# offset: The offset of the trace data in the (single) output buffer. +# start: The event number to start tracing on +# stop: The event number to stop tracing on +# events: A list of events to trace. 
Up to 8 events are allowed in aie2, more are ignored + +# Some events: +# TRUE (0x01) +# STREAM_STALL (0x18) +# LOCK_STALL (0x1A) +# EVENTS_CORE_INSTR_EVENT_1 (0x22) +# EVENTS_CORE_INSTR_EVENT_0 (0x21) +# INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction +# INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction +# INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction +# EVENTS_CORE_PORT_RUNNING_1 (0x4F) +# EVENTS_CORE_PORT_RUNNING_0 (0x4B) + + +# Event numbers should be less than 128. +# Big assumption: The bd_id and channel are unused. If they are used by something else, then +# everything will probably break. +def configure_simple_tracing_aie2( + tile, shim, channel, bd_id, ddr_id, size, offset, start, stop, events +): + # Shim has to be a... shim. Also needs to be a NOC tile, but we don't have + # an easy way of checking that through python. + assert int(shim.row) == 0 + + # Pad the input so we have exactly 8 events. + events = (events + [0] * 8)[:8] + + # 0x340D0: Trace Control 0 + # 0xAABB---C + # AA <- Event to stop trace capture + # BB <- Event to start trace capture + # C <- Trace mode, 00=event-time, 01=event-PC, 10=execution + # Configure so that "Event 1" (always true) causes tracing to start + ipu_write32( + column=int(tile.col), + row=int(tile.row), + address=0x340D0, + value=pack4bytes(stop, start, 0, 0), + ) + # 0x340D4: Trace Control 1 + # This is used to control packet routing. For the moment + # only deal with the simple case of circuit routing. 
+ ipu_write32( + column=int(tile.col), + row=int(tile.row), + address=0x340D4, + value=0, + ) + # 0x340E0: Trace Event Group 1 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=int(tile.col), + row=int(tile.row), + address=0x340E0, + value=pack4bytes(*events[0:4]), + ) + # 0x340E4: Trace Event Group 2 (Which events to trace) + # 0xAABBCCDD AA, BB, CC, DD <- four event slots + ipu_write32( + column=int(tile.col), + row=int(tile.row), + address=0x340E4, + value=pack4bytes(*events[4:8]), + ) + + # 0x3FF00: Stream switch event port selection 0 + def master(port): + return port | (1 << 5) + + def slave(port): + return port + + ipu_write32( + column=int(tile.col), + row=int(tile.row), + address=0x3FF00, + value=pack4bytes(0, 0, slave(1), master(1)), # port 1 is FIFO0? + ) + ipu_write32( + column=int(tile.col), + row=int(tile.row), + address=0x3FF04, + value=pack4bytes(0, 0, 0, 0), + ) + + # Configure a buffer descriptor to write tracing information that has been routed into this shim tile + # out to host DDR memory + ipu_writebd_shimtile( + bd_id=bd_id, + buffer_length=size, + buffer_offset=offset, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + column=int(shim.col), + column_num=1, + d0_size=0, + d0_stride=0, + d1_size=0, + d1_stride=0, + d2_stride=0, + ddr_id=ddr_id, + iteration_current=0, + iteration_size=0, + iteration_stride=0, + lock_acq_enable=0, + lock_acq_id=0, + lock_acq_val=0, + lock_rel_id=0, + lock_rel_val=0, + next_bd=0, + use_next_bd=0, + valid_bd=1, + ) + # configure S2MM channel + ipu_write32( + column=int(shim.col), + row=int(shim.row), + address=0x1D204 if channel == 0 else 0x1D20C, + value=bd_id, + ) diff --git a/python/utils/xrt.py b/python/utils/xrt.py new file mode 100644 index 0000000000..fa36ff096a --- /dev/null +++ b/python/utils/xrt.py @@ -0,0 +1,162 @@ +# xrt.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. + +# from npu.runtime +import pyxrt as xrt + +# import npu.runtime as xrt +import numpy as np + + +class AIE_Application: + + def __init__(self, xclbin_path, insts_path, kernel_name="PP_FD_PRE"): + self.device = None + self.kernel = None + self.buffers = [None] * 8 + self.device = xrt.device(0) + + # Find kernel by name in the xclbin + self.xclbin = xrt.xclbin(xclbin_path) + kernels = self.xclbin.get_kernels() + try: + xkernel = [k for k in kernels if kernel_name == k.get_name()][0] + except KeyError: + raise AIE_Application_Error("No such kernel: " + kernel_name) + self.device.register_xclbin(self.xclbin) + self.context = xrt.hw_context(self.device, self.xclbin.get_uuid()) + self.kernel = xrt.kernel(self.context, xkernel.get_name()) + + ## Set up instruction stream + insts = read_insts(insts_path) + self.n_insts = len(insts) + self.insts_buffer = AIE_Buffer( + self, 0, insts.dtype, insts.shape, xrt.bo.cacheable + ) + self.insts_buffer.write(insts) + + def register_buffer(self, group_id, *args, **kwargs): + self.buffers[group_id] = AIE_Buffer(self, group_id, *args, **kwargs) + + def run(self): + self.insts_buffer.sync_to_device() + h = self.call() + h.wait() + + def call(self): + h = self.kernel( + self.insts_buffer.bo, + self.n_insts * 4, + *[b.bo for b in self.buffers if b is not None], + ) + return h + + def __del__(self): + del self.kernel + del self.device + + +class AIE_Buffer: + + def __init__(self, application, group_id, dtype, shape, flags=xrt.bo.host_only): + self.application = application + self.dtype = dtype + self.shape = shape + self.len_bytes = np.prod(shape) * np.dtype(dtype).itemsize + self.bo = xrt.bo( + application.device, + self.len_bytes, + flags, + application.kernel.group_id(group_id), + ) + + def read(self): + self.sync_from_device() + return self.bo.read(self.len_bytes, 
0).view(self.dtype).reshape(self.shape) + + def write(self, v, offset=0): + self.bo.write(v.view(np.uint8), offset) + self.sync_to_device() + + def sync_to_device(self): + return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + def sync_from_device(self): + return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + + def __del__(self): + del self.bo + self.bo = None + + +class AIE_Application_Error(Exception): + pass + + +insts_cache = {} + + +def read_insts(insts_path): + global insts_cache + if insts_path in insts_cache: + # Speed up things if we re-configure the array a lot: Don't re-parse + # the insts.txt each time + return insts_cache[insts_path] + with open(insts_path, "r") as f: + insts_text = f.readlines() + insts_text = [l for l in insts_text if l != ""] + insts_v = np.array([int(c, 16) for c in insts_text], dtype=np.uint32) + insts_cache[insts_path] = insts_v + return insts_v + + +def setup_aie( + xclbin_path, + insts_path, + in_0_shape, + in_0_dtype, + in_1_shape, + in_1_dtype, + out_buf_shape, + out_buf_dtype, + enable_trace=False, + kernel_name="MLIR_AIE", + trace_size=16384, +): + app = AIE_Application(xclbin_path, insts_path, kernel_name) + app.register_buffer(2, shape=in_0_shape, dtype=in_0_dtype) + app.register_buffer(3, shape=in_1_shape, dtype=in_1_dtype) + if enable_trace: + out_buf_len_bytes = np.prod(out_buf_shape) * np.dtype(out_buf_dtype).itemsize + out_buf_shape = (out_buf_len_bytes + trace_size,) + out_buf_dtype = np.uint8 + app.register_buffer(4, shape=out_buf_shape, dtype=out_buf_dtype) + return app + + +def extract_trace(out_buf, out_buf_shape, out_buf_dtype, trace_size): + trace_size_words = trace_size // 4 + out_buf_flat = out_buf.reshape((-1,)).view(np.uint32) + output_prefix = ( + out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape) + ) + trace_suffix = out_buf_flat[-trace_size_words:] + return output_prefix, trace_suffix + + +def write_out_trace(trace, file_name): + out_str = 
"\n".join(f"{i:0{8}x}" for i in trace if i != 0) + with open(file_name, "w") as f: + f.write(out_str) + + +def execute(app, ifm_mem_fmt, total_wts): + app.buffers[2].write(ifm_mem_fmt) # input's standard format CYX | scalar YCX + app.buffers[3].write(total_wts) # wts's standard format OIYX | scalar OIYX + app.run() + return app.buffers[4].read()