diff --git a/programming_examples/basic/eltwise_exp/test.py b/programming_examples/basic/eltwise_exp/test.py index cc132020d9..315b0c46c4 100644 --- a/programming_examples/basic/eltwise_exp/test.py +++ b/programming_examples/basic/eltwise_exp/test.py @@ -9,8 +9,7 @@ import sys import time -sys.path.append("../../programming_examples/utils") -import test_utils +import aie.utils.test as test_utils # ------------------------------------------------------ # Configure this to match your design's buffer size diff --git a/programming_examples/basic/eltwise_mul/test.py b/programming_examples/basic/eltwise_mul/test.py index 6bb9636093..1e6028a504 100644 --- a/programming_examples/basic/eltwise_mul/test.py +++ b/programming_examples/basic/eltwise_mul/test.py @@ -11,8 +11,7 @@ import sys import time -sys.path.append("../../python") -import test_utils +import aie.utils.test as test_utils # ------------------------------------------------------ # Configure this to match your design's buffer size diff --git a/programming_examples/ml/eltwise_add/test.py b/programming_examples/ml/eltwise_add/test.py index b53b00db20..ad433460f1 100644 --- a/programming_examples/ml/eltwise_add/test.py +++ b/programming_examples/ml/eltwise_add/test.py @@ -12,8 +12,7 @@ import sys import time -sys.path.append("../../programming_examples/utils") -import test_utils +import aie.utils.test as test_utils # ------------------------------------------------------ # Configure this to match your design's buffer size diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index af11f43567..59d89d38a5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,6 +26,15 @@ declare_mlir_python_sources(AIEPythonSources.Util util.py ) +declare_mlir_python_sources(AIEPythonSources.Utils + ADD_TO_PARENT AIEPythonSources + SOURCES + utils/test.py + utils/xrt.py + utils/ml.py + utils/trace.py +) + declare_mlir_python_sources(AIEPythonSources.Extras ADD_TO_PARENT AIEPythonSources SOURCES_GLOB diff --git a/python/test_utils.py b/python/test_utils.py deleted file mode 100644 index 5e187f6330..0000000000 --- a/python/test_utils.py +++ /dev/null @@ -1,67 +0,0 @@ -# test_utils.py -*- Python -*- -# -# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. 
-# SPDX-License-Identifier: MIT
-
-import argparse
-
-
-# options
-def parse_args(args):
-    p = argparse.ArgumentParser()
-    p.add_argument(
-        "-x", "--xclbin", required=True, dest="xclbin", help="the input xclbin path"
-    )
-    p.add_argument(
-        "-k",
-        "--kernel",
-        required=True,
-        dest="kernel",
-        default="MLIR_AIE",
-        help="the kernel name in the XCLBIN (for instance MLIR_AIE)",
-    )
-    p.add_argument(
-        "-v", "--verbosity", default=0, type=int, help="the verbosity of the output"
-    )
-    p.add_argument(
-        "-i",
-        "--instr",
-        dest="instr",
-        default="instr.txt",
-        help="path of file containing userspace instructions sent to the NPU",
-    )
-    p.add_argument(
-        "--verify",
-        dest="verify",
-        default=True,
-        help="whether to verify the AIE computed output",
-    )
-    p.add_argument(
-        "--iters",
-        dest="iters",
-        default=1,
-        type=int,
-        help="number of benchmark iterations",
-    )
-    p.add_argument(
-        "--warmup",
-        dest="warmup_iters",
-        default=0,
-        type=int,
-        help="number of warmup iterations",
-    )
-    p.add_argument(
-        "-t",
-        "--trace_sz",
-        dest="trace_size",
-        default=0,
-        type=int,
-        help="trace size in bytes",
-    )
-    p.add_argument(
-        "--trace_file",
-        dest="trace_file",
-        default="trace.txt",
-        help="where to store trace output",
-    )
-    return p.parse_args(args)
diff --git a/python/utils/ml.py b/python/utils/ml.py
new file mode 100644
index 0000000000..527ced00e9
--- /dev/null
+++ b/python/utils/ml.py
@@ -0,0 +1,440 @@
+# ml.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+import os
+from torch.utils.data import Dataset
+
+# from PIL import Image
+import json
+import sys
+import csv
+import argparse
+import numpy as np
+
+# import cv2
+
+# from PIL import Image
+from pathlib import Path
+import torch
+import torch.nn as nn
+
+# from prettytable import PrettyTable
+import math
+
+
+# class ImageNetKaggle(Dataset):
+#     def __init__(self, root, split, transform=None):
+#         self.samples = []
+#         self.targets = []
+#         self.transform = transform
+#         self.syn_to_class = {}
+#         with open(os.path.join(root, "imagenet_class_index.json"), "rb") as f:
+#             json_file = json.load(f)
+#             for class_id, v in json_file.items():
+#                 self.syn_to_class[v[0]] = int(class_id)
+#         with open(os.path.join(root, "ILSVRC2012_val_labels.json"), "rb") as f:
+#             self.val_to_syn = json.load(f)
+#         samples_dir = os.path.join(root, "ILSVRC/Data/CLS-LOC", split)
+#         for entry in os.listdir(samples_dir):
+#             if split == "train":
+#                 syn_id = entry
+#                 target = self.syn_to_class[syn_id]
+#                 syn_folder = os.path.join(samples_dir, syn_id)
+#                 for sample in os.listdir(syn_folder):
+#                     sample_path = os.path.join(syn_folder, sample)
+#                     self.samples.append(sample_path)
+#                     self.targets.append(target)
+#             elif split == "val":
+#                 syn_id = self.val_to_syn[entry]
+#                 target = self.syn_to_class[syn_id]
+#                 sample_path = os.path.join(samples_dir, entry)
+#                 self.samples.append(sample_path)
+#                 self.targets.append(target)
+
+#     def __len__(self):
+#         return len(self.samples)
+
+#     def __getitem__(self, idx):
+#         x = Image.open(self.samples[idx]).convert("RGB")
+#         if self.transform:
+#             x = self.transform(x)
+#         return x, self.targets[idx]
+
+
+class CSVLogger:
+    def __init__(self, filename, sep=","):
+        self.filename = str(filename)
+        if os.path.exists(self.filename):
+            with open(self.filename) as f:
+                self.columns = 
csv.DictReader(f).fieldnames + else: + self.columns = None + self.fh = open(self.filename, "a", newline="") + self.csvwriter = csv.writer(self.fh, delimiter=sep) + self.count = 0 + + def set_columns(self, columns): + if self.columns: + raise Exception("Columns already set") + self.columns = list(columns) + self.csvwriter.writerow(self.columns) + + def append(self, row): + if self.columns is None: + self.set_columns(row.keys()) + self.csvwriter.writerow([row.get(k, "-") for k in self.columns]) + self.count += 1 + if self.count > 100: + self.count = 0 + self.fh.flush() + + def close(self): + self.fh.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + +def load_class_label(class_label_file: str, num_classes: int) -> list: + class_label = json.load(open(class_label_file)) + class_label_list = [class_label[str(i)] for i in range(num_classes)] + + return class_label_list + + +# def count_parameters(model): +# table = PrettyTable(["Modules", "Parameters"]) +# total_params = 0 +# for name, parameter in model.named_parameters(): +# if not parameter.requires_grad: +# continue +# param = parameter.numel() +# table.add_row([name, param]) +# total_params += param +# print(table) +# print(f"Total Trainable Params: {total_params}") +# return total_params + + +def unpickle(file): + import pickle + + with open(file, "rb") as fo: + dict = pickle.load(fo, encoding="latin1") + return dict + + +# def extract_cifar(): +# datafile = r"./data_torchvision/cifar-10-batches-py/test_batch" +# metafile = r"./data_torchvision/cifar-10-batches-py/batches.meta" + +# data_batch_1 = unpickle(datafile) +# metadata = unpickle(metafile) + +# images = data_batch_1["data"] +# labels = data_batch_1["labels"] +# images = np.reshape(images, (10000, 3, 32, 32)) + +# import os + +# dirname = "cifar_images" +# if not os.path.exists(dirname): +# os.mkdir(dirname) + +# # Extract and dump first 10 images +# for i in range(0, 100): +# im = images[i] +# im = im.transpose(1, 2, 0) +# im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) +# im_name = f"./cifar_images/image_{i}.png" +# cv2.imwrite(im_name, im) + + +def fuse_single_conv_bn_pair(bn_mean, bn_var, bn_wts, bn_bias, conv_wts): + # https://github.com/ChoiDM/Pytorch_BN_Fold/blob/master/bn_fold.py + eps = 1e-05 + mu = bn_mean + var = bn_var + gamma = bn_wts + # if 'bias' in bn_bias: + # beta = bn_bias + # else: + # beta = torch.zeros(gamma.size(0)).float() + beta = bn_bias + # Conv params + W = conv_wts + + denom = torch.sqrt(var + eps) + + A = gamma.div(denom) + # bias = torch.zeros(W.size(0)).float() + # b = beta - gamma.mul(mu).div(denom) + # bias *= A + A = A.expand_as(W.transpose(0, -1)).transpose(0, -1) + A = A.to(torch.int8) + W.mul_(A) + # bias.add_(b) + + return W + + +class DataShaper: + def __init__(self, defOrder="RC", print_info=False): + self.defOrder = defOrder + self.print_info = print_info + self.log_msg = [] + + def _reorder_granularity_range( + self, order, z, start_data_dim=-1, stop_str_dim=None + ): + if stop_str_dim is None: + stop_str_dim = len(order) + gran = 1 + pre_group = {} + for idx, s in enumerate(order.split(z)[1:]): + step = "" + pg = "" + off = 0 + if s.find(")") >= 0 and (s.find("(") > s.find(")") or s.find("(") < 0): + pg, s = s.split(")", 1) + for i, c in enumerate(s): + if c.isdigit(): + step += c + else: + if c == ">": + off = int(s[i + 1]) + if c == "<": + off = -int(s[i + 1]) + break + if len(pg) > 0 and pg[0] in "<>": + if pg[0] == ">": + off = int(pg[1]) + if pg[0] == "<": + off = -int(pg[1]) + pg = pg[2:] + if 
step and idx + off > start_data_dim and idx <= stop_str_dim:
+                gran *= int(step)
+            for p in pg:
+                if p.isdigit() or p in "<>":
+                    continue
+                elif p in pre_group:
+                    pre_group[p] *= int(step)
+                else:
+                    pre_group[p] = int(step)
+        return gran, pre_group
+
+    def _reorder_decode(self, shape, order, defOrder=None):
+        if not defOrder:
+            defOrder = self.defOrder
+        Ds = [order.count(c) for c in defOrder]
+        D = list(shape)
+        size = [0] * sum(Ds)
+        perm = [0] * sum(Ds)
+        pad_im = [0] * len(shape)
+        pad_ex = [0] * sum(Ds)
+        brdcst = [1] * sum(Ds)
+        align = [1] * sum(Ds)
+        val = ""
+        val_gi = ""
+        off = 0
+        group = False
+        d = [sum(Ds[0 : i + 1]) - 1 for i in range(len(Ds))]
+        p = sum(Ds) - 1
+        for z in reversed(order):
+            if z.isdigit():
+                if group:
+                    val_gi = z + val_gi
+                else:
+                    val = z + val
+            elif z == ">":
+                if group:
+                    off = int(val_gi)
+                    val_gi = ""
+                else:
+                    off = int(val)
+                    val = ""
+            elif z == "<":
+                if group:
+                    off = -int(val_gi)
+                    val_gi = ""
+                else:
+                    off = -int(val)
+                    val = ""
+            elif z == ")":
+                group = True
+            elif z == "(":
+                group = False
+                off = 0
+                val = ""
+                val_gi = ""
+            elif z == "%":  # Pad dimension by N
+                pad_ex[p + 1] += max(0, int(val) - 1) * (
+                    size[perm[p + 1]] + pad_ex[p + 1]
+                )
+                val = ""
+            elif z == "*":  # Broadcast dimension by N
+                brdcst[p] *= int(val)
+                val = ""
+            elif z == "|":  # Align data after a dimension to N
+                align[p] *= int(val)
+                val = ""
+            elif z in defOrder:
+                idx = defOrder.find(z)
+                perm[p] = d[idx] + off
+                if off < 0:
+                    start_dim = d[idx] + off - sum(Ds[0:idx]) if val else -1
+                    stop_dim = d[idx] - sum(Ds[0:idx])
+                    gran, pre_group = self._reorder_granularity_range(
+                        order, z, start_dim, stop_dim
+                    )
+                    for i, c in enumerate(defOrder):
+                        if c in pre_group:
+                            if D[i] >= pre_group[c]:
+                                gran //= pre_group[c]
+                            elif D[i] > 1:
+                                gran = int(math.ceil(1.0 * gran / D[i]))
+                    D_rem = max(1, D[idx] // gran)
+                else:
+                    D_rem = D[idx]
+                if val:
+                    vi = int(val)
+                    if group:
+                        if vi > D_rem:
+                            vi_rem = vi // D_rem
+                            vi //= vi_rem
+                            val = str(vi_rem)
+                        else:
+                            val = "1"
+                    else:
+                        val = ""
+                else:
+                    vi = D_rem
+                if vi > 0:
+                    if D[idx] % vi != 0:
+                        dim_sub = np.prod(
+                            np.maximum(1, size[sum(Ds[0:idx]) : sum(Ds[0 : idx + 1])])
+                        )
+                        pad_im[idx] += (vi - D[idx] % vi) * dim_sub
+                    size[d[idx] + off] = vi
+                    D[idx] = int(math.ceil(1.0 * D[idx] / vi))
+                if not group:
+                    off = 0
+                d[idx] -= 1
+                p -= 1
+        if self.print_info:
+            self.log_msg.append(
+                "[INFO]: reorder s={:<15} o={:<15} -> pi={:<15} s={:<30} p={:<30} pe={:<30}, b={:<30}, a={:<30}".format(
+                    *map(str, (shape, order, pad_im, size, perm, pad_ex, brdcst, align))
+                )
+            )
+        return pad_im, size, perm, pad_ex, brdcst, align
+
+    def reorder_mat(self, mat, order, defOrder=None, inverse=False):
+        pad_im, size, perm, pad_ex, brdcst, align = self._reorder_decode(
+            mat.shape, order, defOrder
+        )
+        if not inverse:
+            if sum(pad_im) > 0:
+                mat = np.pad(mat, tuple(zip([0] * len(pad_im), pad_im)), "constant")
+            mat = mat.reshape(*size).transpose(perm)
+            if sum(pad_ex) > 0:
+                mat = np.pad(mat, tuple(zip([0] * len(pad_ex), pad_ex)), "constant")
+            if np.prod(brdcst) > 1:
+                for idx, b in enumerate(brdcst):
+                    if b > 1:
+                        mat = np.repeat(mat, b, axis=idx)
+            if np.prod(align) > 1:
+                for idx, a in reversed(tuple(enumerate(align))):
+                    if a > 1:
+                        mat = mat.reshape(mat.shape[: idx + 1] + (-1,))
+                        pad = a - (mat.shape[-1] % a)
+                        if pad < a:
+                            # np.int was removed from NumPy; use an explicit int64
+                            mp = np.zeros((len(mat.shape), 2), dtype=np.int64)
+                            mp[-1, -1] = pad
+                            mat = np.pad(mat, mp, "constant")
+        else:
+            assert sum(pad_im) == 0, "Reverse of implicit padding not supported"
+            assert sum(pad_ex) == 0, "Reverse of explicit padding not supported"
+            assert np.prod(brdcst) == 1, "Reverse of broadcasting not supported"
+            assert np.prod(align) == 1, "Reverse of alignment not supported"
+            perm_inv = [perm.index(p) for p in range(len(perm))]
+            size_inv = [size[p] for p in perm]
+            mat = mat.reshape(*size_inv)
+            mat = mat.transpose(perm_inv)
+
+        return mat.reshape(-1)
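
For orientation, `reorder_mat` consumes an order string over the axis names in `defOrder` (default `"RC"`): the letter sequence fixes the traversal order, digits tile an axis, and `%`/`*`/`|` pad, broadcast, and align. A minimal illustrative sketch, not part of the patch, using only plain axis permutation:

```python
import numpy as np
from aie.utils.ml import DataShaper

ds = DataShaper()
m = np.arange(6, dtype=np.int8).reshape(2, 3)  # [[0 1 2], [3 4 5]]

ds.reorder_mat(m, "RC")  # row-major flatten (identity): [0 1 2 3 4 5]
ds.reorder_mat(m, "CR")  # column-major flatten (transpose): [0 3 1 4 2 5]
```
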
"Reverse of explicit padding not supported" + assert np.prod(brdcst) == 1, "Reverse of broadcasting not supported" + assert np.prod(align) == 1, "Reverse of alignment not supported" + perm_inv = [perm.index(p) for p in range(len(perm))] + size_inv = [size[p] for p in perm] + mat = mat.reshape(*size_inv) + mat = mat.transpose(perm_inv) + + return mat.reshape(-1) + + def get_dim_steps( + self, shape, order, defOrder=None, bits=8, ebs=None, sparse_ratio=1 + ): + pad_im, size, perm, pad_ex, brdcst, align = self._reorder_decode( + shape, order, defOrder + ) + sz = 1 + d = len(shape) - 1 + sp = len(perm) - 1 + dim = [0] * len(shape) + for i, s in enumerate(reversed(size)): + sz *= s + if sz >= shape[d] + pad_im[d]: + # current dimension contains all elements + p = len(perm) - 1 - i + dim[d] = pi0 = perm.index(p) + if p + 1 < sp and p + 1 in perm: + pi1 = perm.index(p + 1) + if pi0 + 1 == pi1: + # Found XX coupling + dim[d] = pi1 + self.log_msg.append( + "INFO: Found XX coupling (order={}, size={}, perm={}, p={})".format( + order, size, perm, p + ) + ) + elif len(perm) > pi0 + 1: + pb = perm[pi0 + 1] + if pi0 + 2 == pi1 and size[pb] == 1: + # Found XNX sequence with N=1, simplify + dim[d] = pi1 + self.log_msg.append( + "INFO: Found XNX sequency with N=1, simplify (order={}, size={}, perm={}, p={})".format( + order, size, perm, p + ) + ) + sz = 1 + d -= 1 + sp = p + # dim = [perm.index(p) for p in dim] + size_inv = (np.array(size)[perm] + pad_ex) * brdcst + idx = -2 if bits == 4 and size_inv[-1] == 2 else -1 + if ebs or sparse_ratio: + assert ( + size_inv[idx] >= 8 + ), "Sparse/exponent block is too small. Data (order) unexpected or update to script is required" + size_inv[idx] = int( + size_inv[idx] * sparse_ratio * (bits - (8 if ebs else 0)) / 8 + ) + (size_inv[idx] // ebs if ebs else 0) + step = [0] * (len(shape) + 1) + cur = 1 + for i_rev, (s, al) in enumerate(reversed(tuple(zip(size_inv, align)))): + i = len(perm) - 1 - i_rev + if al > 1: + cur = ((cur + al - 1) // al) * al + if i in dim: + step[dim.index(i)] = cur + cur *= s + step[-1] = cur + return step + + +if __name__ == "__main__": + extract_cifar() diff --git a/python/utils/test.py b/python/utils/test.py new file mode 100644 index 0000000000..81c3e51e59 --- /dev/null +++ b/python/utils/test.py @@ -0,0 +1,168 @@ +# ml.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
diff --git a/python/utils/test.py b/python/utils/test.py
new file mode 100644
index 0000000000..81c3e51e59
--- /dev/null
+++ b/python/utils/test.py
@@ -0,0 +1,168 @@
+# test.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+import argparse
+import sys
+
+import pyxrt as xrt
+
+
+# options
+def parse_args(args):
+    return create_default_argparser().parse_args(args)
+
+
+# Add default args to standard parser object
+def create_default_argparser():
+    p = argparse.ArgumentParser()
+    p.add_argument(
+        "-x", "--xclbin", required=True, dest="xclbin", help="the input xclbin path"
+    )
+    p.add_argument(
+        "-k",
+        "--kernel",
+        required=True,
+        dest="kernel",
+        default="MLIR_AIE",
+        help="the kernel name in the XCLBIN (for instance MLIR_AIE)",
+    )
+    p.add_argument(
+        "-v", "--verbosity", default=0, type=int, help="the verbosity of the output"
+    )
+    p.add_argument(
+        "-i",
+        "--instr",
+        dest="instr",
+        default="instr.txt",
+        help="path of file containing userspace instructions sent to the NPU",
+    )
+    p.add_argument(
+        "--verify",
+        dest="verify",
+        default=True,
+        help="whether to verify the AIE computed output",
+    )
+    p.add_argument(
+        "--iters",
+        dest="iters",
+        default=1,
+        type=int,
+        help="number of benchmark iterations",
+    )
+    p.add_argument(
+        "--warmup",
+        dest="warmup_iters",
+        default=0,
+        type=int,
+        help="number of warmup iterations",
+    )
+    p.add_argument(
+        "-t",
+        "--trace_sz",
+        dest="trace_size",
+        default=0,
+        type=int,
+        help="trace size in bytes",
+    )
+    p.add_argument(
+        "--trace_file",
+        dest="trace_file",
+        default="trace.txt",
+        help="where to store trace output",
+    )
+    return p
+
+
+def write_out_trace(trace_buffer, trace_size, trace_file):
+    try:
+        with open(trace_file, "wt") as f:
+            f.write(trace_buffer)
+    except Exception as e:
+        print(e)
+        sys.exit(1)
+
+
+def init_xrt_load_kernel(opts):
+    # Get a device handle
+    device = xrt.device(0)
+
+    # Load the xclbin
+    xclbin = xrt.xclbin(opts.xclbin)
+
+    # Load the kernel
+    kernels = xclbin.get_kernels()
+    try:
+        xkernel = [k for k in kernels if opts.kernel in k.get_name()][0]
+    except IndexError:
+        print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'")
+        sys.exit(1)
+
+    # Register xclbin
+    device.register_xclbin(xclbin)
+
+    # Get a hardware context
+    context = xrt.hw_context(device, xclbin.get_uuid())
+
+    # Get a kernel handle
+    kernel = xrt.kernel(context, xkernel.get_name())
+
+    return (device, kernel)
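
Together, `parse_args` and `init_xrt_load_kernel` reduce a host test to argument parsing, buffer setup, and a kernel call. A condensed sketch of that flow; the buffer group ids, sizes, and the kernel-argument convention (mirrored from `AIE_Application.call` in `utils/xrt.py`) are assumptions, not fixed by this patch:

```python
import sys
import numpy as np
import pyxrt as xrt
import aie.utils.test as test_utils

opts = test_utils.parse_args(sys.argv[1:])
device, kernel = test_utils.init_xrt_load_kernel(opts)

# Stage the NPU instruction stream in a cacheable buffer (group 0 assumed)
with open(opts.instr, "r") as f:
    instr_v = np.array([int(i, 16) for i in f.readlines() if i.strip()], dtype=np.uint32)
bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
bo_instr.write(instr_v, 0)

# One input, one output (group ids 2 and 3 assumed for this design)
A = np.random.randint(0, 100, 1024, dtype=np.int32)
bo_in = xrt.bo(device, A.nbytes, xrt.bo.host_only, kernel.group_id(2))
bo_out = xrt.bo(device, A.nbytes, xrt.bo.host_only, kernel.group_id(3))
bo_in.write(A, 0)
for bo in (bo_instr, bo_in):
    bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)

h = kernel(bo_instr, len(instr_v) * 4, bo_in, bo_out)  # arg convention as in AIE_Application.call
h.wait()
bo_out.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
C = bo_out.read(A.nbytes, 0).view(np.int32)
```
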
diff --git a/python/utils/trace.py b/python/utils/trace.py
new file mode 100644
index 0000000000..8c3e97be87
--- /dev/null
+++ b/python/utils/trace.py
@@ -0,0 +1,186 @@
+# trace.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+import numpy as np
+
+from aie.dialects.aiex import *
+
+
+def extract_trace(out_buf, out_buf_shape, out_buf_dtype, trace_size):
+    trace_size_words = trace_size // 4
+    out_buf_flat = out_buf.reshape((-1,)).view(np.uint32)
+    output_prefix = (
+        out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape)
+    )
+    trace_suffix = out_buf_flat[-trace_size_words:]
+    return output_prefix, trace_suffix
+
+
+def write_out_trace(trace, file_name):
+    out_str = "\n".join(f"{i:08x}" for i in trace if i != 0)
+    with open(file_name, "w") as f:
+        f.write(out_str)
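
On the host side, these two helpers split the combined read-back into design output and trace words; a sketch, assuming a 4096-element int32 output with `opts.trace_size` bytes of trace appended:

```python
import numpy as np
import aie.utils.trace as trace_utils

# entire_buffer: uint32 view of the output BO read back from the device
output, trace_words = trace_utils.extract_trace(
    entire_buffer, (4096,), np.int32, opts.trace_size
)
trace_utils.write_out_trace(trace_words, opts.trace_file)
```
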
+
+
+def pack4bytes(b3, b2, b1, b0):
+    w = (b3 & 0xFF) << 24
+    w |= (b2 & 0xFF) << 16
+    w |= (b1 & 0xFF) << 8
+    w |= (b0 & 0xFF) << 0
+    return w
+
+
+# Configure tracing, see https://github.com/Xilinx/mlir-aie/blob/resnet/docs/Tracing.md
+# This is a very simple model of tracing, which has some big assumptions:
+# 1) Trace data is collected over circuit switched connections, not packet-switched
+# 2) A ShimDMA S2MM channel is dedicated to the trace data
+# 3) Trace data is small enough to fit in a fixed-size buffer, which is collected with the
+#    outputs of the design
+# 4) The usual model of '2 inputs, 1 output' is followed, and the
+#    trace data is appended to the other outputs
+
+# tile: The tile we're tracing
+# shim: The shim tile to output data with.
+# bd_id: The BD in the shim tile to use.
+# channel: The S2MM channel to use (0 or 1).
+# size: The size of the trace data
+# offset: The offset of the trace data in the (single) output buffer.
+# start: The event number to start tracing on
+# stop: The event number to stop tracing on
+# events: A list of events to trace. Up to 8 events are allowed in aie2, more are ignored
+
+# Some events:
+# TRUE (0x01)
+# STREAM_STALL (0x18)
+# LOCK_STALL (0x1A)
+# EVENTS_CORE_INSTR_EVENT_1 (0x22)
+# EVENTS_CORE_INSTR_EVENT_0 (0x21)
+# INSTR_VECTOR (0x25) Core executes a vector MAC, ADD or compare instruction
+# INSTR_LOCK_ACQUIRE_REQ (0x2C) Core executes a lock acquire instruction
+# INSTR_LOCK_RELEASE_REQ (0x2D) Core executes a lock release instruction
+# EVENTS_CORE_PORT_RUNNING_1 (0x4F)
+# EVENTS_CORE_PORT_RUNNING_0 (0x4B)
+
+
+# Event numbers should be less than 128.
+# Big assumption: The bd_id and channel are unused. If they are used by something else, then
+# everything will probably break.
+def configure_simple_tracing_aie2(
+    tile, shim, channel, bd_id, ddr_id, size, offset, start, stop, events
+):
+    # The shim tile must be a NOC shim (row 0); we have no easy way of
+    # checking the NOC part through Python, so only the row is asserted.
+    assert int(shim.row) == 0
+
+    # Pad the event list so we have exactly 8 events
+    events = (events + [0] * 8)[:8]
+
+    # 0x340D0: Trace Control 0
+    # 0xAABB---C
+    #   AA <- Event to stop trace capture
+    #   BB <- Event to start trace capture
+    #    C <- Trace mode, 00=event-time, 01=event-PC, 10=execution
+    ipu_write32(
+        column=int(tile.col),
+        row=int(tile.row),
+        address=0x340D0,
+        value=pack4bytes(stop, start, 0, 0),
+    )
+    # 0x340D4: Trace Control 1
+    # This is used to control packet routing. For the moment
+    # only deal with the simple case of circuit routing.
+    ipu_write32(
+        column=int(tile.col),
+        row=int(tile.row),
+        address=0x340D4,
+        value=0,
+    )
+    # 0x340E0: Trace Event Group 1 (Which events to trace)
+    # 0xAABBCCDD AA, BB, CC, DD <- four event slots
+    ipu_write32(
+        column=int(tile.col),
+        row=int(tile.row),
+        address=0x340E0,
+        value=pack4bytes(*events[0:4]),
+    )
+    # 0x340E4: Trace Event Group 2 (Which events to trace)
+    # 0xAABBCCDD AA, BB, CC, DD <- four event slots
+    ipu_write32(
+        column=int(tile.col),
+        row=int(tile.row),
+        address=0x340E4,
+        value=pack4bytes(*events[4:8]),
+    )
+
+    # 0x3FF00: Stream switch event port selection 0
+    def master(port):
+        return port | (1 << 5)
+
+    def slave(port):
+        return port
+
+    ipu_write32(
+        column=int(tile.col),
+        row=int(tile.row),
+        address=0x3FF00,
+        value=pack4bytes(0, 0, slave(1), master(1)),  # port 1 is FIFO0?
+    )
+    ipu_write32(
+        column=int(tile.col),
+        row=int(tile.row),
+        address=0x3FF04,
+        value=pack4bytes(0, 0, 0, 0),
+    )
+
+    # Configure a buffer descriptor to write tracing information that has been routed into this shim tile
+    # out to host DDR memory
+    ipu_writebd_shimtile(
+        bd_id=bd_id,
+        buffer_length=size,
+        buffer_offset=offset,
+        enable_packet=0,
+        out_of_order_id=0,
+        packet_id=0,
+        packet_type=0,
+        column=int(shim.col),
+        column_num=1,
+        d0_size=0,
+        d0_stride=0,
+        d1_size=0,
+        d1_stride=0,
+        d2_stride=0,
+        ddr_id=ddr_id,
+        iteration_current=0,
+        iteration_size=0,
+        iteration_stride=0,
+        lock_acq_enable=0,
+        lock_acq_id=0,
+        lock_acq_val=0,
+        lock_rel_id=0,
+        lock_rel_val=0,
+        next_bd=0,
+        use_next_bd=0,
+        valid_bd=1,
+    )
+    # Configure the S2MM channel to kick off the buffer descriptor
+    ipu_write32(
+        column=int(shim.col),
+        row=int(shim.row),
+        address=0x1D204 if channel == 0 else 0x1D20C,
+        value=bd_id,
+    )
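
A call site, for reference: this would sit inside a design's IPU instruction sequence, after the compute and shim tiles are declared. The tile names, BD number, channel, and event selection below are illustrative only:

```python
import aie.utils.trace as trace_utils

# Hypothetical tiles from the surrounding design; the trace stream lands
# after the first N_out_bytes of the single output buffer (ddr_id=2).
trace_utils.configure_simple_tracing_aie2(
    ComputeTile2,
    ShimTile,
    channel=1,
    bd_id=13,
    ddr_id=2,
    size=trace_size,
    offset=N_out_bytes,
    start=0x1,  # TRUE: start tracing immediately
    stop=0x0,
    events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F],
)
```
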
diff --git a/python/utils/xrt.py b/python/utils/xrt.py
new file mode 100644
index 0000000000..fa36ff096a
--- /dev/null
+++ b/python/utils/xrt.py
@@ -0,0 +1,162 @@
+# xrt.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+# from npu.runtime
+import pyxrt as xrt
+
+# import npu.runtime as xrt
+import numpy as np
+
+
+class AIE_Application:
+
+    def __init__(self, xclbin_path, insts_path, kernel_name="PP_FD_PRE"):
+        self.device = None
+        self.kernel = None
+        self.buffers = [None] * 8
+        self.device = xrt.device(0)
+
+        # Find kernel by name in the xclbin
+        self.xclbin = xrt.xclbin(xclbin_path)
+        kernels = self.xclbin.get_kernels()
+        try:
+            xkernel = [k for k in kernels if kernel_name == k.get_name()][0]
+        except IndexError:
+            raise AIE_Application_Error("No such kernel: " + kernel_name)
+        self.device.register_xclbin(self.xclbin)
+        self.context = xrt.hw_context(self.device, self.xclbin.get_uuid())
+        self.kernel = xrt.kernel(self.context, xkernel.get_name())
+
+        ## Set up instruction stream
+        insts = read_insts(insts_path)
+        self.n_insts = len(insts)
+        self.insts_buffer = AIE_Buffer(
+            self, 0, insts.dtype, insts.shape, xrt.bo.cacheable
+        )
+        self.insts_buffer.write(insts)
+
+    def register_buffer(self, group_id, *args, **kwargs):
+        self.buffers[group_id] = AIE_Buffer(self, group_id, *args, **kwargs)
+
+    def run(self):
+        self.insts_buffer.sync_to_device()
+        h = self.call()
+        h.wait()
+
+    def call(self):
+        h = self.kernel(
+            self.insts_buffer.bo,
+            self.n_insts * 4,
+            *[b.bo for b in self.buffers if b is not None],
+        )
+        return h
+
+    def __del__(self):
+        del self.kernel
+        del self.device
+
+
+class AIE_Buffer:
+
+    def __init__(self, application, group_id, dtype, shape, flags=xrt.bo.host_only):
+        self.application = application
+        self.dtype = dtype
+        self.shape = shape
+        self.len_bytes = np.prod(shape) * np.dtype(dtype).itemsize
+        self.bo = xrt.bo(
+            application.device,
+            self.len_bytes,
+            flags,
+            application.kernel.group_id(group_id),
+        )
+
+    def read(self):
+        self.sync_from_device()
+        return self.bo.read(self.len_bytes, 0).view(self.dtype).reshape(self.shape)
+
+    def write(self, v, offset=0):
+        self.bo.write(v.view(np.uint8), offset)
+        self.sync_to_device()
+
+    def sync_to_device(self):
+        return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+
+    def sync_from_device(self):
+        return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
+
+    def __del__(self):
+        del self.bo
+        self.bo = None
+
+
+class AIE_Application_Error(Exception):
+    pass
+
+
+insts_cache = {}
+
+
+def read_insts(insts_path):
+    global insts_cache
+    if insts_path in insts_cache:
+        # Speed up things if we re-configure the array a lot: Don't re-parse
+        # the insts.txt each time
+        return insts_cache[insts_path]
+    with open(insts_path, "r") as f:
+        insts_text = f.readlines()
+        # Skip blank lines; int(..., 16) tolerates the trailing newlines
+        insts_text = [l for l in insts_text if l.strip()]
+        insts_v = np.array([int(c, 16) for c in insts_text], dtype=np.uint32)
+        insts_cache[insts_path] = insts_v
+    return insts_v
+
+
+def setup_aie(
+    xclbin_path,
+    insts_path,
+    in_0_shape,
+    in_0_dtype,
+    in_1_shape,
+    in_1_dtype,
+    out_buf_shape,
+    out_buf_dtype,
+    enable_trace=False,
+    kernel_name="MLIR_AIE",
+    trace_size=16384,
+):
+    app = AIE_Application(xclbin_path, insts_path, kernel_name)
+    app.register_buffer(2, shape=in_0_shape, dtype=in_0_dtype)
+    app.register_buffer(3, shape=in_1_shape, dtype=in_1_dtype)
+    if enable_trace:
+        out_buf_len_bytes = np.prod(out_buf_shape) * np.dtype(out_buf_dtype).itemsize
+        out_buf_shape = (out_buf_len_bytes + trace_size,)
+        out_buf_dtype = np.uint8
+    app.register_buffer(4, shape=out_buf_shape, dtype=out_buf_dtype)
+    return app
+
+
+def extract_trace(out_buf, out_buf_shape, out_buf_dtype, trace_size):
+    trace_size_words = trace_size // 4
+    out_buf_flat = out_buf.reshape((-1,)).view(np.uint32)
+    output_prefix = (
+        out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape)
+    )
+    trace_suffix = out_buf_flat[-trace_size_words:]
+    return output_prefix, trace_suffix
+
+
+def write_out_trace(trace, file_name):
+    out_str = "\n".join(f"{i:08x}" for i in trace if i != 0)
+    with open(file_name, "w") as f:
+        f.write(out_str)
+
+
+def execute(app, ifm_mem_fmt, total_wts):
+    app.buffers[2].write(ifm_mem_fmt)  # input's standard format CYX | scalar YCX
+    app.buffers[3].write(total_wts)  # wts's standard format OIYX | scalar OIYX
+    app.run()
+    return app.buffers[4].read()
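
Taken together, `utils/xrt.py` reduces a full run to a handful of calls; the paths, shapes, and dtypes below are placeholders:

```python
import numpy as np
import aie.utils.xrt as xrt_utils

app = xrt_utils.setup_aie(
    "final.xclbin",
    "insts.txt",
    in_0_shape=(1024,),
    in_0_dtype=np.int8,
    in_1_shape=(1024,),
    in_1_dtype=np.int8,
    out_buf_shape=(1024,),
    out_buf_dtype=np.int8,
)
ifm = np.zeros((1024,), dtype=np.int8)
wts = np.zeros((1024,), dtype=np.int8)
out = xrt_utils.execute(app, ifm, wts)  # writes groups 2 and 3, runs, reads group 4
```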