diff --git a/aie_kernels/aie2/add.cc b/aie_kernels/aie2/add.cc
new file mode 100755
index 0000000000..75a0552ec8
--- /dev/null
+++ b/aie_kernels/aie2/add.cc
@@ -0,0 +1,61 @@
+//===- add.cc ---------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+template <typename T_in, typename T_out, const int N>
+void eltwise_add(T_in *a, T_in *b, T_out *c) {
+  // Scalar path: write a[k] + b[k] into c[k] for each k in [0, N).
+  for (int k = 0; k < N; ++k)
+    *(c + k) = *(a + k) + *(b + k);
+}
+
+template <typename T_in, typename T_out, const int N>
+void eltwise_vadd(T_in *a, T_in *b, T_out *c) {
+  constexpr int vec_factor = 16;    // elements consumed per loop iteration
+  constexpr int F = N / vec_factor; // trip count; there is no scalar tail loop
+  static_assert(N % vec_factor == 0 && F >= 16, "N: multiple of 16, >= 256");
+  event0(); // event0()/event1() bracket the loop (presumably trace markers)
+  T_in *__restrict pA1 = a;
+  T_in *__restrict pB1 = b;
+  T_out *__restrict pC1 = c;
+  for (int i = 0; i < F; i++)
+    chess_prepare_for_pipelining chess_loop_range(16, ) { // promises >= 16 trips
+      aie::vector<T_in, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
+      pA1 += vec_factor;
+      aie::vector<T_in, vec_factor> B0 = aie::load_v<vec_factor>(pB1);
+      pB1 += vec_factor;
+      aie::vector<T_out, vec_factor> sum = aie::add(A0, B0);
+      aie::store_v(pC1, sum);
+      pC1 += vec_factor;
+    }
+  event1();
+}
+
+extern "C" {
+// C-linkage entry point: scalar bf16 add over a fixed 1024 elements.
+void eltwise_add_bf16_scalar(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) {
+  eltwise_add<bfloat16, bfloat16, 1024>(a_in, b_in, c_out);
+}
+// C-linkage entry point: vectorized bf16 add over a fixed 1024 elements.
+void eltwise_add_bf16_vector(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out) {
+  eltwise_vadd<bfloat16, bfloat16, 1024>(a_in, b_in, c_out);
+}
+
+} // extern "C"
diff --git a/programming_examples/vision/vision_kernels/passThrough.cc b/aie_kernels/aie_generic/passThrough.cc
similarity index 84%
rename from programming_examples/vision/vision_kernels/passThrough.cc
rename to aie_kernels/aie_generic/passThrough.cc
index 0928af33f0..1de33066d1 100644
--- a/programming_examples/vision/vision_kernels/passThrough.cc
+++ b/aie_kernels/aie_generic/passThrough.cc
@@ -15,30 +15,21 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 template <typename T, int N>
 __attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
                                                const int32_t height,
                                                const int32_t width) {
-  //::aie::vector<T, N> data_out;
-  //::aie::mask<N> temp_val;
+  event0();
+
   v64uint8 *restrict outPtr = (v64uint8 *)out;
   v64uint8 *restrict inPtr = (v64uint8 *)in;
 
   for (int j = 0; j < (height * width); j += N) // Nx samples per loop
-    chess_prepare_for_pipelining chess_loop_range(6, ) {
-      //::aie::vector<T, N> tmpVector = ::aie::load_v(in);
-      //::aie::store_v(out, tmpVector);
-
-      *outPtr++ = *inPtr++;
+    chess_prepare_for_pipelining chess_loop_range(6, ) { *outPtr++ = *inPtr++; }
 
-      // in += N;
-      // out += N;
-    }
+  event1();
 }
 
 extern "C" {
diff --git a/aie_kernels/generic/vector_max.cc b/aie_kernels/generic/vector_max.cc
new file mode 100644
index 0000000000..f94646ff0c
--- /dev/null
+++ b/aie_kernels/generic/vector_max.cc
@@ -0,0 +1,53 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+void vector(int32_t *restrict in, int32_t *restrict out) {
+  // Vector path: reduces 1024 int32 inputs to a single maximum in out[0].
+  v16int32 tiny = broadcast_to_v16int32((int32_t)-2147483648); // INT32_MIN
+  int32_t input_size = 1024;
+  int32_t vector_size = 16;
+  v16int32 after_vector;
+  v16int32 running_max = tiny;
+  for (int32_t i = 0; i < input_size; i += vector_size) // 1024 / 16 = 64 trips
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16int32 next = *(v16int32 *)(in + i);
+      v16int32 test = max(running_max, next); // presumably a per-lane max
+      running_max = test;
+    }
+  after_vector = running_max;
+  v16int32 first = shift_bytes(after_vector, after_vector, 32); // 32 B = 8 lanes
+  v16int32 second = max(after_vector, first);
+  v16int32 second_shift = shift_bytes(second, second, 16); // 16 B = 4 lanes
+  v16int32 third = max(second, second_shift);
+  v16int32 third_shift = shift_bytes(third, third, 8); // 8 B = 2 lanes
+  v16int32 fourth = max(third, third_shift);
+  v16int32 fourth_shift = shift_bytes(fourth, fourth, 4); // 4 B = 1 lane
+  v16int32 fifth = max(fourth, fourth_shift);
+  int32_t last = extract_elem(fifth, 0); // lane 0; presumably the overall max
+  *(int32_t *)out = last;
+  return;
+}
+
+// Scalar reference: stores the maximum of in[0..1023] into out[0].
+void scalar(int32_t *restrict in, int32_t *restrict out) {
+  const int32_t input_size = 1024; // signed, so `i < input_size` is not mixed
+  int32_t running_max = INT32_MIN; // identity element for max
+  for (int32_t i = 0; i < input_size; i++) {
+    if (in[i] > running_max)
+      running_max = in[i];
+  }
+  *out = running_max;
+  return;
+}
+
+extern "C" {
+// C-linkage entry point: forwards to the intrinsic-based vector() above.
+void vector_max(int32_t *a_in, int32_t *c_out) { vector(a_in, c_out); }
+// C-linkage entry point: forwards to the scalar() reference loop above.
+void scalar_max(int32_t *a_in, int32_t *c_out) { scalar(a_in, c_out); }
+
+} // extern "C"
diff --git a/aie_kernels/generic/vector_min.cc b/aie_kernels/generic/vector_min.cc
new file mode 100644
index 0000000000..c60ca574a1
--- /dev/null
+++ b/aie_kernels/generic/vector_min.cc
@@ -0,0 +1,53 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+void vector(int32_t *restrict in, int32_t *restrict out) {
+  // Vector path: reduces 1024 int32 inputs to a single minimum in out[0].
+  v16int32 massive = broadcast_to_v16int32((int32_t)2147483647); // INT32_MAX
+  int32_t input_size = 1024;
+  int32_t vector_size = 16;
+  v16int32 after_vector;
+  v16int32 running_min = massive;
+  for (int32_t i = 0; i < input_size; i += vector_size) // 1024 / 16 = 64 trips
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16int32 next = *(v16int32 *)(in + i);
+      v16int32 test = min(running_min, next); // presumably a per-lane min
+      running_min = test;
+    }
+  after_vector = running_min;
+  v16int32 first = shift_bytes(after_vector, after_vector, 32); // 32 B = 8 lanes
+  v16int32 second = min(after_vector, first);
+  v16int32 second_shift = shift_bytes(second, second, 16); // 16 B = 4 lanes
+  v16int32 third = min(second, second_shift);
+  v16int32 third_shift = shift_bytes(third, third, 8); // 8 B = 2 lanes
+  v16int32 fourth = min(third, third_shift);
+  v16int32 fourth_shift = shift_bytes(fourth, fourth, 4); // 4 B = 1 lane
+  v16int32 fifth = min(fourth, fourth_shift);
+  int32_t last = extract_elem(fifth, 0); // lane 0; presumably the overall min
+  *(int32_t *)out = last;
+  return;
+}
+
+// Scalar reference: stores the minimum of in[0..1023] into out[0].
+void scalar(int32_t *restrict in, int32_t *restrict out) {
+  const int32_t input_size = 1024; // signed, so `i < input_size` is not mixed
+  int32_t running_min = INT32_MAX; // identity element for min
+  for (int32_t i = 0; i < input_size; i++) {
+    if (in[i] < running_min)
+      running_min = in[i];
+  }
+  *out = running_min;
+  return;
+}
+
+extern "C" {
+// C-linkage entry point: forwards to the intrinsic-based vector() above.
+void vector_min(int32_t *a_in, int32_t *c_out) { vector(a_in, c_out); }
+// C-linkage entry point: forwards to the scalar() reference loop above.
+void scalar_min(int32_t *a_in, int32_t *c_out) { scalar(a_in, c_out); }
+
+} // extern "C"
diff --git a/aie_kernels/relu.cc b/aie_kernels/relu.cc
new file mode 100644
index 0000000000..a2e87cffc4
--- /dev/null
+++ b/aie_kernels/relu.cc
@@ -0,0 +1,41 @@
+//===- relu.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#include <aie_api/aie.hpp>
+
+// ReLU over TILE_SIZE bf16 values: c[k] = max(a[k], 0), 32 lanes per step.
+void relu(bfloat16 *restrict a, bfloat16 *restrict c, const int TILE_SIZE) {
+  const int v_factor = 32;
+  v32bfloat16 zeroes = broadcast_zero_bfloat16();
+  event0();
+  // int index: a size_t index would wrap a negative TILE_SIZE to a huge bound.
+  for (int i = 0; i < TILE_SIZE; i += v_factor)
+    chess_prepare_for_pipelining chess_loop_range(32, 32) { // 32 trips <=> 1024
+      v32bfloat16 input = *(v32bfloat16 *)(a + i);
+      v32bfloat16 output = max(input, zeroes);
+      *(v32bfloat16 *)(c + i) = output;
+    }
+  event1();
+}
+
+extern "C" {
+// C-linkage entry point: runs relu() on a fixed tile of 1024 bf16 elements.
+void bf16_relu(bfloat16 *a_in, bfloat16 *c_out) { relu(a_in, c_out, 1024); }
+
+} // extern "C"
diff --git a/programming_examples/basic/eltwise_exp/Makefile b/programming_examples/basic/eltwise_exp/Makefile
new file mode 100644
index 0000000000..167ac5d741
--- /dev/null
+++ b/programming_examples/basic/eltwise_exp/Makefile
@@ -0,0 +1,53 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+.PHONY: all run run_py clean # none of these names a file to be built
+all: build/final.xclbin
+targetname = eltwise_exp
+
+build/lut_based_ops.o: ../../../aie_runtime_lib/AIE2/lut_based_ops.cpp # rebuild when the source changes
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
+
+build/exp.o: ../../../aie_kernels/aie2/bf16_exp.cc # rebuild when the source changes
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I ../../../../aie_runtime_lib/AIE2 -c ../../../../aie_kernels/aie2/bf16_exp.cc -o ${@F}
+
+build/kernels.a: build/exp.o build/lut_based_ops.o
+	ar rvs $@ $+
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/final.xclbin: build/aie.mlir build/kernels.a
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} 
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/eltwise_mul/CMakeLists.txt b/programming_examples/basic/eltwise_mul/CMakeLists.txt
index c64f84842b..c4ca0825d4 100644
--- a/programming_examples/basic/eltwise_mul/CMakeLists.txt
+++ b/programming_examples/basic/eltwise_mul/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,7 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
-    ../../../programming_examples/utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/eltwise_mul/Makefile b/programming_examples/basic/eltwise_mul/Makefile
index 363f12c4e4..1ad0072822 100644
--- a/programming_examples/basic/eltwise_mul/Makefile
+++ b/programming_examples/basic/eltwise_mul/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../../../programming_examples/basic/makefile-common
+include ../../makefile-common
 
 all: build/final.xclbin
 
@@ -14,7 +14,7 @@ targetname = myEltwiseMul
 
 build/mul.o:
 	mkdir -p ${@D}
-	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/mul.cc -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/mul.cc -o ${@F}
 
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
@@ -28,8 +28,7 @@ build/final.xclbin: build/aie.mlir build/mul.o
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-#	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/basic/eltwise_mul/test.py b/programming_examples/basic/eltwise_mul/test.py
index cc132020d9..9d22bcf9dc 100644
--- a/programming_examples/basic/eltwise_mul/test.py
+++ b/programming_examples/basic/eltwise_mul/test.py
@@ -9,7 +9,7 @@
 import sys
 import time
 
-sys.path.append("../../programming_examples/utils")
+sys.path.append("../../python")
 import test_utils
 
 # ------------------------------------------------------
diff --git a/programming_examples/basic/log_hello_world/CMakeLists.txt b/programming_examples/basic/log_hello_world/CMakeLists.txt
index d9f511062f..c4ca0825d4 100755
--- a/programming_examples/basic/log_hello_world/CMakeLists.txt
+++ b/programming_examples/basic/log_hello_world/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/log_hello_world/Makefile b/programming_examples/basic/log_hello_world/Makefile
index bc7292c90a..c5bcd8d5c3 100755
--- a/programming_examples/basic/log_hello_world/Makefile
+++ b/programming_examples/basic/log_hello_world/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 all: hello_world_elfstrings.csv
 
diff --git a/programming_examples/basic/matrix_multiplication/CMakeLists.txt b/programming_examples/basic/matrix_multiplication/CMakeLists.txt
index 4ad848b7a5..dfe345e188 100644
--- a/programming_examples/basic/matrix_multiplication/CMakeLists.txt
+++ b/programming_examples/basic/matrix_multiplication/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -42,6 +47,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     ${subdir}/test.cpp
 )
 
@@ -50,6 +56,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index 44bf19e3e3..fd6a438ea0 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -20,17 +20,21 @@
 # Since the targets are all the same for all designs, they are defined here.
 # Subdirectories need only include this makefile-common after defining the 
 # following variables:
-# - subdir      -- subdirectory you are including this from
+# - subdir	  -- subdirectory you are including this from
 # - targetname  -- resulting test host code will be named targetname.exe
-# - kernels     -- which kernels in kernels folder to compile and link in
-#                  (without file extension)
-# - M, K, N     -- (optional) dimensions of matrices, may be used by design;
-#                  N=1 for matrix-vector
+# - kernels	 -- which kernels in kernels folder to compile and link in
+#					  (without file extension)
+# - M, K, N	 -- (optional) dimensions of matrices, may be used by design;
+#					  N=1 for matrix-vector
 
-include ../../makefile-common
+#include ${CURDIR}/../../makefile-common
+current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+#include ${current_dir}../../makefile-common
+SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
+include ${SELF_DIR}../../makefile-common
 
 # defaults; overwrite if needed
-M?=512   
+M?=512	
 K?=512
 N?=512
 
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile
index 719291732f..c701ce9a50 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/Makefile
@@ -16,4 +16,6 @@ M=288
 K=288
 N=1
 
-include ../makefile-common
\ No newline at end of file
+SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
+include ${SELF_DIR}../makefile-common
+
diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile
index 231d94b2bc..ca92224df8 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/Makefile
+++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile
@@ -16,5 +16,6 @@ M=256
 K=256
 N=256
 
-include ../makefile-common
+SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
+include ${SELF_DIR}../makefile-common
 
diff --git a/programming_examples/basic/passthrough_dmas/CMakeLists.txt b/programming_examples/basic/passthrough_dmas/CMakeLists.txt
new file mode 100644
index 0000000000..3986c4a075
--- /dev/null
+++ b/programming_examples/basic/passthrough_dmas/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# CMAKE_CXX_STANDARD 23 needs CMake >= 3.20; target_link_directories >= 3.13
+cmake_minimum_required(VERSION 3.20)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName proj_${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/passthrough_hardware/Makefile b/programming_examples/basic/passthrough_dmas/Makefile
old mode 100755
new mode 100644
similarity index 93%
rename from programming_examples/basic/passthrough_hardware/Makefile
rename to programming_examples/basic/passthrough_dmas/Makefile
index 75147e0d60..13a8d42aae
--- a/programming_examples/basic/passthrough_hardware/Makefile
+++ b/programming_examples/basic/passthrough_dmas/Makefile
@@ -6,19 +6,17 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
 SHELL := /bin/bash
 
-targetname = passThroughHardware
-devicename = ipu
-col = 0
-LENGTH ?= 4096
-
 all: build/final.xclbin build/insts.txt
 
+targetname = passThroughDMAs
+LENGTH ?= 4096
+
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
 	python3 $< ${devicename} ${col} ${LENGTH} ${LENGTH} > $@
@@ -71,4 +69,4 @@ vck5000: build/aie.mlir
 
 
 clean:
-	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
+	rm -rf build _build inst ${targetname}.exe
diff --git a/programming_examples/basic/passthrough_hardware/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py
similarity index 92%
rename from programming_examples/basic/passthrough_hardware/aie2.py
rename to programming_examples/basic/passthrough_dmas/aie2.py
index b2f30745b9..1f4b4882ba 100755
--- a/programming_examples/basic/passthrough_hardware/aie2.py
+++ b/programming_examples/basic/passthrough_dmas/aie2.py
@@ -14,7 +14,6 @@
 from aie.extras.context import mlir_mod_ctx
 
 N = 4096
-N_in_bytes = N * 4
 
  
 # Deciphering the command line arguments 
@@ -54,9 +53,8 @@ def device_body():
             # Compute tile 2
             @core(ComputeTile2)
             def core_body():
-                tmp = memref.alloc(1, T.i32())
-                v0 = arith.constant(0, T.i32())
-                memref.store(v0, tmp, [0])
+                for _ in for_(sys.maxsize):
+                    yield_([])
 
             # To/from AIE-array data movement
             tensor_ty = T.memref(N, T.i32())
diff --git a/programming_examples/basic/passthrough_hardware/run.lit b/programming_examples/basic/passthrough_dmas/run.lit
similarity index 94%
rename from programming_examples/basic/passthrough_hardware/run.lit
rename to programming_examples/basic/passthrough_dmas/run.lit
index 62d66040ff..a4f5d568b6 100644
--- a/programming_examples/basic/passthrough_hardware/run.lit
+++ b/programming_examples/basic/passthrough_dmas/run.lit
@@ -1,7 +1,7 @@
 // (c) Copyright 2023 Advanced Micro Devices, Inc.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// REQUIRES: ryzen_ai, chess
+// REQUIRES: ryzen_ai
 //
 // RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
 // RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
diff --git a/programming_examples/basic/passthrough_hardware/run_vck5000.lit b/programming_examples/basic/passthrough_dmas/run_vck5000.lit
similarity index 100%
rename from programming_examples/basic/passthrough_hardware/run_vck5000.lit
rename to programming_examples/basic/passthrough_dmas/run_vck5000.lit
diff --git a/programming_examples/basic/passthrough_hardware/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp
similarity index 100%
rename from programming_examples/basic/passthrough_hardware/test.cpp
rename to programming_examples/basic/passthrough_dmas/test.cpp
diff --git a/programming_examples/basic/passthrough_hardware/test_vck5000.cpp b/programming_examples/basic/passthrough_dmas/test_vck5000.cpp
similarity index 100%
rename from programming_examples/basic/passthrough_hardware/test_vck5000.cpp
rename to programming_examples/basic/passthrough_dmas/test_vck5000.cpp
diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
new file mode 100644
index 0000000000..8eba23849f
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
@@ -0,0 +1,80 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Xilinx Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# CMAKE_CXX_STANDARD 23 needs CMake >= 3.20; target_link_directories >= 3.13
+cmake_minimum_required(VERSION 3.20)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif ()
+
+set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC 
+        PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
+        DISABLE_ABI_CHECK=1 
+        )
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
new file mode 100644
index 0000000000..fe7daabd4b
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -0,0 +1,49 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+VPATH := ../../../aie_kernels/aie_generic
+
+PASSTHROUGH_SIZE = 4096
+
+targetname = passThroughKernel
+
+.PHONY: all template run clean # `run` executes the test; it names no file
+
+all: build/final_${PASSTHROUGH_SIZE}.xclbin
+
+build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${PASSTHROUGH_SIZE} > $@
+
+build/passThrough.cc.o: passThrough.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+	
+build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final_${PASSTHROUGH_SIZE}.xclbin build/insts.txt
+	${powershell} ./$< -x build/final_${PASSTHROUGH_SIZE}.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build ${targetname}.exe
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
new file mode 100644
index 0000000000..b401f5801f
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -0,0 +1,170 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+N = 1024  # default tensor size; the code below treats it as a byte count
+
+if len(sys.argv) == 2:
+    N = int(sys.argv[1])  # optional override: first command-line argument
+
+lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
+lineWidthInInt32s = lineWidthInBytes // 4  # same width counted in 32-bit words
+
+enableTrace = False  # gates the trace flow and trace register setup below
+traceSizeInBytes = 8192
+traceSizeInInt32s = traceSizeInBytes // 4
+
+
+def passthroughKernel():
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.ipu)
+        def device_body():
+            # define types
+            memRef_ty = T.memref(lineWidthInBytes, T.ui8())
+
+            # AIE Core Function declarations
+            passThroughLine = external_func(
+                "passThroughLine", inputs=[memRef_ty, memRef_ty, T.i32()]
+            )
+
+            # Tile declarations
+            ShimTile = tile(0, 0)
+            ComputeTile2 = tile(0, 2)
+
+            if enableTrace:
+                flow(ComputeTile2, "Trace", 0, ShimTile, "DMA", 1)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+
+            # Set up compute tiles
+
+            # Compute tile 2
+            @core(ComputeTile2, "passThrough.cc.o")
+            def core_body():
+                for _ in for_(sys.maxsize):
+                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    call(passThroughLine, [elemIn, elemOut, lineWidthInBytes])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            #    print(ctx.module.operation.verify())
+
+            tensorSize = N
+            tensorSizeInInt32s = tensorSize // 4
+            tensor_ty = T.memref(lineWidthInInt32s, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+            def sequence(inTensor, outTensor, notUsed):
+                if enableTrace:
+                    # Trace output
+
+                    # Trace_Event0, Trace_Event1: Select which events to trace.
+                    # Note that the event buffers only appear to be transferred to DDR in
+                    # bursts of 256 bytes. If less than 256 bytes are written, you may not
+                    # see trace output, or only see it on the next iteration of your
+                    # kernel invocation, as the buffer gets filled up. Note that, even
+                    # though events are encoded as 4 byte words, it may take more than 64
+                    # events to fill the buffer to 256 bytes and cause a flush, since
+                    # multiple repeating events can be 'compressed' by the trace mechanism.
+                    # In order to always generate sufficient events, we add the "assert
+                    # TRUE" event to one slot, which fires every cycle, and thus fills our
+                    # buffer quickly.
+
+                    # Some events:
+                    # TRUE                       (0x01)
+                    # STREAM_STALL               (0x18)
+                    # LOCK_STALL                 (0x1A)
+                    # EVENTS_CORE_INSTR_EVENT_1  (0x22)
+                    # EVENTS_CORE_INSTR_EVENT_0  (0x21)
+                    # INSTR_VECTOR               (0x25)  Core executes a vecotr MAC, ADD or compare instruction
+                    # INSTR_LOCK_ACQUIRE_REQ     (0x2C)  Core executes a lock acquire instruction
+                    # INSTR_LOCK_RELEASE_REQ     (0x2D)  Core executes a lock release instruction
+                    # EVENTS_CORE_PORT_RUNNING_1 (0x4F)
+                    # EVENTS_CORE_PORT_RUNNING_0 (0x4B)
+
+                    # Trace_Event0  (4 slots)
+                    IpuWrite32(0, 2, 0x340E0, 0x4B222125)
+                    # Trace_Event1  (4 slots)
+                    IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)
+
+                    # Event slots as configured above:
+                    # 0: Kernel executes vector instruction
+                    # 1: Event 0 -- Kernel starts
+                    # 2: Event 1 -- Kernel done
+                    # 3: Port_Running_0
+                    # 4: Port_Running_1
+                    # 5: Lock Stall
+                    # 6: Lock Acquire Instr
+                    # 7: Lock Release Instr
+
+                    # Stream_Switch_Event_Port_Selection_0
+                    # This is necessary to capture the Port_Running_0 and Port_Running_1 events
+                    IpuWrite32(0, 2, 0x3FF00, 0x121)
+
+                    # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
+                    IpuWrite32(0, 2, 0x340D0, 0x10000)
+
+                    # Start trace copy out.
+                    IpuWriteBdShimTile(
+                        bd_id=3,
+                        buffer_length=traceSizeInBytes,
+                        buffer_offset=tensorSize,
+                        enable_packet=0,
+                        out_of_order_id=0,
+                        packet_id=0,
+                        packet_type=0,
+                        column=0,
+                        column_num=1,
+                        d0_stride=0,
+                        d0_wrap=0,
+                        d1_stride=0,
+                        d1_wrap=0,
+                        d2_stride=0,
+                        ddr_id=2,
+                        iteration_current=0,
+                        iteration_stride=0,
+                        iteration_wrap=0,
+                        lock_acq_enable=0,
+                        lock_acq_id=0,
+                        lock_acq_val=0,
+                        lock_rel_id=0,
+                        lock_rel_val=0,
+                        next_bd=0,
+                        use_next_bd=0,
+                        valid_bd=1,
+                    )
+                    IpuWrite32(0, 0, 0x1D20C, 0x3)
+
+                ipu_dma_memcpy_nd(
+                    metadata="in",
+                    bd_id=0,
+                    mem=inTensor,
+                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                )
+                ipu_dma_memcpy_nd(
+                    metadata="out",
+                    bd_id=1,
+                    mem=outTensor,
+                    sizes=[1, 1, 1, tensorSizeInInt32s],
+                )
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+passthroughKernel()
diff --git a/programming_examples/basic/passthrough_kernel/run.lit b/programming_examples/basic/passthrough_kernel/run.lit
new file mode 100644
index 0000000000..ba521da236
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/run.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o
+// RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp
new file mode 100644
index 0000000000..7b8779ca13
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/test.cpp
@@ -0,0 +1,118 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using DATATYPE = std::uint8_t;
+#endif
+
+namespace po = boost::program_options;
+
+int main(int argc, const char *argv[]) {
+
+  // Describe and parse the command-line options
+  po::options_description options("Allowed options");
+  options.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map args;
+
+  test_utils::parse_options(argc, argv, options, args);
+
+  // Read the instruction words that will be handed to the device
+  std::vector<uint32_t> instrs =
+      test_utils::load_instr_sequence(args["instr"].as<std::string>());
+
+  int verbosity = args["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instrs.size() << "\n";
+
+  // Bring up XRT and load the kernel named on the command line
+  xrt::device device;
+  xrt::kernel kernel;
+
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   args["xclbin"].as<std::string>(),
+                                   args["kernel"].as<std::string>());
+
+  // Allocate the instruction, input and output buffer objects
+  const auto instrBytes = instrs.size() * sizeof(int);
+  const auto dataBytes = PASSTHROUGH_SIZE * sizeof(DATATYPE);
+  auto bo_instr =
+      xrt::bo(device, instrBytes, XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inA =
+      xrt::bo(device, dataBytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_out =
+      xrt::bo(device, dataBytes, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Stage the instruction stream in its buffer object
+  void *instrDst = bo_instr.map<void *>();
+  memcpy(instrDst, instrs.data(), instrBytes);
+
+  // Fill the input buffer with index values (narrowed to DATATYPE)
+  DATATYPE *srcData = bo_inA.map<DATATYPE *>();
+  for (int idx = 0; idx < PASSTHROUGH_SIZE; idx++) {
+    srcData[idx] = idx;
+  }
+
+  // Clear the output buffer
+  DATATYPE *dstData = bo_out.map<DATATYPE *>();
+  memset(dstData, 0, dataBytes);
+
+  // Push host-side contents to the device
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // Launch the kernel and block until it completes
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  auto run = kernel(bo_instr, instrs.size(), bo_inA, bo_out);
+  run.wait();
+
+  // Pull the result back to the host
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  // Count positions where output and input disagree
+  int errors = 0;
+  for (int idx = 0; idx < PASSTHROUGH_SIZE; idx++)
+    errors += (dstData[idx] != srcData[idx]) ? 1 : 0;
+
+  // Report the verdict; the exit code mirrors it
+  if (errors == 0) {
+    std::cout << std::endl << "PASS!" << std::endl << std::endl;
+    return 0;
+  }
+  std::cout << std::endl
+            << errors << " mismatches." << std::endl
+            << std::endl;
+  std::cout << std::endl << "fail." << std::endl << std::endl;
+  return 1;
+}
diff --git a/programming_examples/basic/relu/CMakeLists.txt b/programming_examples/basic/relu/CMakeLists.txt
index d9f511062f..c4ca0825d4 100644
--- a/programming_examples/basic/relu/CMakeLists.txt
+++ b/programming_examples/basic/relu/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/relu/Makefile b/programming_examples/basic/relu/Makefile
index f862aad403..87e836fbfb 100644
--- a/programming_examples/basic/relu/Makefile
+++ b/programming_examples/basic/relu/Makefile
@@ -6,15 +6,15 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
-
-targetname = testRelu
+include ../../makefile-common
 
 all: build/final.xclbin build/insts.txt
 
-build/bf16_relu.o: ${REPO_ROOT}/aie_kernels/relu.cc
+targetname = testRelu
+
+build/bf16_relu.o: ../../../aie_kernels/relu.cc
 	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $< -o ${@F}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c ../$< -o ${@F}
 
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
@@ -28,7 +28,7 @@ build/final.xclbin: build/aie.mlir build/bf16_relu.o
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
@@ -42,7 +42,6 @@ run: ${targetname}.exe build/final.xclbin build/insts.txt
 run_g: ${targetname}.exe build/final.xclbin build/insts.txt 
 	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --trace_sz 65536
 
-
 trace:
 	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
 
diff --git a/programming_examples/basic/vector_add_reduce/CMakeLists.txt b/programming_examples/basic/vector_add_reduce/CMakeLists.txt
new file mode 100644
index 0000000000..76d48dfe36
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/CMakeLists.txt
@@ -0,0 +1,69 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp  # test_utils:: helpers called by test.cpp
+    test.cpp
+)
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
\ No newline at end of file
diff --git a/programming_examples/basic/vector_add_reduce/Makefile b/programming_examples/basic/vector_add_reduce/Makefile
new file mode 100644
index 0000000000..37ca25abec
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/Makefile
@@ -0,0 +1,76 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+ACDC_AIE = $(dir $(shell which aie-opt))/..
+# NOTE(review): targetname looks like a leftover from vector_max; it only names the host executable.
+targetname = vector_max
+devicename = ipu
+col = 0
+CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
+KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/aie2/
+
+all: build/final.xclbin build/insts.txt
+
+build/i32_add_reduce.o: ${KERNEL_LIB}/i32_add_reduce.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -c $< -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir build/i32_add_reduce.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS}
+vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/
+# NOTE(review): prerequisites are expanded when the Makefile is read, so the KERNEL_LIB override above only reaches recipes, not the source path of build/i32_add_reduce.o.
+vck5000: build/aie.mlir build/i32_add_reduce.o
+	cp build/i32_add_reduce.o* ./
+	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+            	-I${ROCM_ROOT}/include \
+		-L/opt/xaiengine/lib \
+            	-L/lib/x86_64-linux-gnu/ \
+		test_vck5000.cpp \
+		$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+            	${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+		-Wl,-R/opt/xaiengine/lib \
+            	-Wl,-rpath,${ROCM_ROOT}/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+clean: clean_trace
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe i32_add_reduce.o*
\ No newline at end of file
diff --git a/programming_examples/basic/vector_add_reduce/aie2.py b/programming_examples/basic/vector_add_reduce/aie2.py
new file mode 100644
index 0000000000..2e0f980235
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/aie2.py
@@ -0,0 +1,90 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
+import sys
+
+
+def my_vector_max():
+    """Emit the add-reduce design (despite the function name) and print its MLIR."""
+    N = 1024  # number of i32 input elements handed to each kernel call
+    buffer_depth = 2  # depth used for both object fifos
+
+    with mlir_mod_ctx() as ctx:
+
+        if len(sys.argv) != 3:
+            raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+
+        if sys.argv[1] == "ipu":
+            dev = AIEDevice.ipu
+        elif sys.argv[1] == "xcvc1902":
+            dev = AIEDevice.xcvc1902
+        else:
+            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+        @device(dev)
+        def device_body():
+            memRef_I_ty = T.memref(N, T.i32())  # kernel input: N x i32
+            memRef_O_ty = T.memref(1, T.i32())  # kernel output: one i32
+
+            # AIE Core Function declarations
+            # Both variants are declared; core_body below only calls the vector one.
+            i32_add_reduce_vector = external_func(
+                "i32_add_reduce_vector", inputs=[memRef_I_ty, memRef_O_ty]
+            )
+
+            i32_add_reduce_scalar = external_func(
+                "i32_add_reduce_scalar", inputs=[memRef_I_ty, memRef_O_ty]
+            )
+
+            # Tile declarations (column comes from the second command-line argument)
+            ShimTile = tile(int(sys.argv[2]), 0)
+            ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty)
+            of_out = object_fifo(
+                "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty
+            )
+
+            # Set up compute tiles
+
+            # Compute tile 2: links against the object file named here
+            @core(ComputeTile2, "i32_add_reduce.o")
+            def core_body():
+                for _ in for_(0xFFFFFFFF):  # very large trip count
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+
+                    call(
+                        i32_add_reduce_vector,
+                        [elem_in, elem_out],
+                    )
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            # To/from AIE-array data movement: 1 word on "out", N words on "in"
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+                ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+my_vector_max()
diff --git a/programming_examples/basic/vector_add_reduce/run.lit b/programming_examples/basic/vector_add_reduce/run.lit
new file mode 100644
index 0000000000..b29f36cc11
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/run.lit
@@ -0,0 +1,13 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// The kernel object is built next to aie.mlir, as the passthrough_kernel test does.
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/i32_add_reduce.cc -o ./i32_add_reduce.o
+// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/programming_examples/basic/vector_add_reduce/run_vck5000.lit b/programming_examples/basic/vector_add_reduce/run_vck5000.lit
new file mode 100644
index 0000000000..d314eea2a4
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/run_vck5000.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: hsa
+//
+// RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir
+// RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib
+// RUN: %run_on_vck5000 ./test.elf
+
diff --git a/programming_examples/basic/vector_add_reduce/test.cpp b/programming_examples/basic/vector_add_reduce/test.cpp
new file mode 100644
index 0000000000..bd7438a0f9
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/test.cpp
@@ -0,0 +1,261 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+
+using INOUT0_DATATYPE = std::int32_t;
+using INOUT1_DATATYPE = std::int32_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Main: run the add-reduce design on random input and check the returned sum.
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  bool do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 1024; // Input vector: 1024 x int32_t (N in aie2.py)
+  int INOUT1_VOLUME = 1;    // Output: the single int32_t reduction result
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+
+  // Output buffer = result followed by the (optional) trace region
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
+
+  srand(time(NULL));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Load the kernel
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Pick the first kernel whose name starts with Node; bail out if none does
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [Node, verbosity](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   if (verbosity >= 1) {
+                                     std::cout << "Name: " << name << std::endl;
+                                   }
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "Kernel " << Node << " not found in xclbin.\n";
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Fill Inout buffer 0 with random data; the expected result is assumed to
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::uint32_t sum_bits = 0; // be their 32-bit sum (unsigned avoids signed UB)
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    std::int32_t next = test_utils::random_int32_t(100000);
+    sum_bits += static_cast<std::uint32_t>(next);
+    bufInOut0[i] = next;
+  }
+  const std::int32_t sum = static_cast<std::int32_t>(sum_bits);
+  // Initialize Inout buffer 1
+  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+  // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME +
+  // trace_size
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Compare the returned word against the host-side sum
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      if (bufInOut1[0] != sum) {
+        errors++;
+        std::cout << "sum is " << sum << " calc " << bufInOut1[0] << std::endl;
+      }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/vector_add_reduce/test_vck5000.cpp b/programming_examples/basic/vector_add_reduce/test_vck5000.cpp
new file mode 100644
index 0000000000..25de4c7823
--- /dev/null
+++ b/programming_examples/basic/vector_add_reduce/test_vck5000.cpp
@@ -0,0 +1,146 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2020-2022, Xilinx Inc.
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fcntl.h>
+#include <iostream>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+#include <xaiengine.h>
+
+#include "memory_allocator.h"
+#include "test_library.h"
+
+#include "aie_data_movement.cpp"
+#include "aie_inc.cpp"
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+constexpr int DMA_COUNT = 64; // number of uint32_t input words main() fills
+
+// Logs the outcome of an HSA call; failure text is runtime-owned (never freed).
+void hsa_check_status(const std::string func_name, hsa_status_t status) {
+  const char *status_string = "unknown HSA status"; // fallback if lookup fails
+  if (status == HSA_STATUS_SUCCESS) {
+    std::cout << func_name << " success" << std::endl;
+    return;
+  }
+  hsa_status_string(status, &status_string);
+  std::cout << func_name << " failed: " << status_string << std::endl;
+}
+
+// Host-side test for the VCK5000 flow: initializes the device, configures and
+// starts the AIE cores, passes `out` and `in_a` to invoke_data_movement() and
+// compares out[0] with a reference computed from `in_a` on the host.
+// Returns 0 on a match and -1 on a setup failure or a mismatch.
+int main(int argc, char *argv[]) {
+  std::vector<hsa_queue_t *> queues;
+
+  aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie();
+
+  // This is going to initialize HSA, create a queue
+  // and get an agent
+  int ret = mlir_aie_init_device(xaie);
+
+  if (ret) {
+    std::cout << "[ERROR] Error when calling mlir_aie_init_device)"
+              << std::endl;
+    return -1;
+  }
+
+  // Getting access to all of the HSA agents
+  std::vector<hsa_agent_t> agents = xaie->agents;
+
+  if (agents.empty()) {
+    std::cout << "No agents found. Exiting." << std::endl;
+    return -1;
+  }
+
+  std::cout << "Found " << agents.size() << " agents" << std::endl;
+
+  hsa_queue_t *q = xaie->cmd_queue;
+
+  // Adding to our vector of queues
+  queues.push_back(q);
+  assert(queues.size() > 0 && "No queues were sucesfully created!");
+
+  mlir_aie_configure_cores(xaie);
+  mlir_aie_configure_switchboxes(xaie);
+  mlir_aie_initialize_locks(xaie);
+  mlir_aie_configure_dmas(xaie);
+  mlir_aie_start_cores(xaie);
+
+  // Allocating some device memory
+  ext_mem_model_t buf0, buf1, buf2;
+  uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT);
+  uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT);
+  uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(
+      xaie, buf2, 4 /* For some reason can't do 1 */);
+  if (in_a == nullptr || in_b == nullptr || out == nullptr) {
+    std::cout << "Could not allocate in device memory" << std::endl;
+    return -1;
+  }
+
+  out[0] = 0xdeface; // sentinel, so an output that is never written mismatches
+  for (int i = 0; i < DMA_COUNT; i++) {
+    in_a[i] = i + 1;
+  }
+  in_a[DMA_COUNT / 2] = 123456;
+  in_a[DMA_COUNT - 1] = 100;
+
+  // Sync only after the allocations are checked and the host data is written.
+  mlir_aie_sync_mem_dev(buf0);
+  mlir_aie_sync_mem_dev(buf1);
+  mlir_aie_sync_mem_dev(buf2);
+
+  // printf("[EDDIE DEBUG] max_val before data movement is %d\n",
+  // mlir_aie_read_buffer_max_val(xaie, 0));
+
+  // Pass arguments in the order of dma_memcpys in the mlir
+  invoke_data_movement(queues[0], &agents[0], out, in_a);
+
+  int errors = 0;
+
+  // NOTE(review): this file lives under vector_add_reduce, yet the reference
+  // computed below is the maximum of the inputs -- confirm that this is what
+  // the kernel is expected to return.
+  uint32_t max_val = 0;
+  for (int i = 0; i < DMA_COUNT; i++) {
+    uint32_t s = in_a[i];
+    if (max_val < s) {
+      max_val = s;
+    }
+  }
+
+  if (*out != max_val) {
+    errors++;
+    printf("[ERROR] Maximum value is %u but kernel returned %u\n", max_val,
+           *out);
+  }
+
+  // destroying the queue
+  hsa_queue_destroy(queues[0]);
+
+  // Shutdown AIR and HSA
+  mlir_aie_deinit_libxaie(xaie);
+
+  if (!errors) {
+    printf("PASS!\n");
+    return 0;
+  } else {
+    printf("fail %d/%d.\n", errors, 1);
+    return -1;
+  }
+}
diff --git a/programming_examples/basic/vector_bias_add/CMakeLists.txt b/programming_examples/basic/vector_bias_add/CMakeLists.txt
index d9f511062f..c4ca0825d4 100644
--- a/programming_examples/basic/vector_bias_add/CMakeLists.txt
+++ b/programming_examples/basic/vector_bias_add/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_bias_add/Makefile b/programming_examples/basic/vector_bias_add/Makefile
index 14e58914be..80ee07528b 100644
--- a/programming_examples/basic/vector_bias_add/Makefile
+++ b/programming_examples/basic/vector_bias_add/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 all: build/final.xclbin
 
diff --git a/programming_examples/basic/vector_exp/CMakeLists.txt b/programming_examples/basic/vector_exp/CMakeLists.txt
index d9f511062f..c4ca0825d4 100644
--- a/programming_examples/basic/vector_exp/CMakeLists.txt
+++ b/programming_examples/basic/vector_exp/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_exp/Makefile b/programming_examples/basic/vector_exp/Makefile
index 9effbdadfb..02860c4f31 100755
--- a/programming_examples/basic/vector_exp/Makefile
+++ b/programming_examples/basic/vector_exp/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 targetname = testExp
 
@@ -17,15 +17,15 @@ build/vecexp.cc: bf16_exp_lut.mlir
 	cd ${@D} &&	aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o vecexp.cc
 
 build/vecexp.o: build/vecexp.cc
-	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
 
 build/lut_based_ops.o:
 	mkdir -p ${@D}
-	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
 
 build/exp.o: exp.cc
 	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
 
 build/kernels.a: build/exp.o build/lut_based_ops.o build/vecexp.o
 	ar rvs $@ $+
@@ -42,7 +42,7 @@ build/final.xclbin: build/aie.mlir build/kernels.a
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/basic/vector_max/CMakeLists.txt b/programming_examples/basic/vector_max/CMakeLists.txt
index d9f511062f..c64f84842b 100644
--- a/programming_examples/basic/vector_max/CMakeLists.txt
+++ b/programming_examples/basic/vector_max/CMakeLists.txt
@@ -48,6 +48,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_max/Makefile b/programming_examples/basic/vector_max/Makefile
index 4f5f1d5522..4450cdd20f 100755
--- a/programming_examples/basic/vector_max/Makefile
+++ b/programming_examples/basic/vector_max/Makefile
@@ -10,18 +10,26 @@ include ../makefile-common
 
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
-SHELL := /bin/bash
-
-targetname = vectorMax
+targetname = vector_max
 devicename = ipu
 col = 0
+CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
+KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/
 
-all: build/final.xclbin
+all: build/final.xclbin build/insts.txt
 
-build/final.xclbin: build/aie.mlir
+build/vector_max.o: ${KERNEL_LIB}/vector_max.cc
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir build/vector_max.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
@@ -34,30 +42,35 @@ else
 	cp _build/${targetname} $@ 
 endif
 
-build/aie.mlir: aie2.py
-	mkdir -p ${@D}
-	python3 $< ${devicename} ${col} > $@
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
 
 # Changing variables when we target VCK5000
 vck5000: devicename=xcvc1902
 vck5000: col=6
+vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS}
+vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/
 
-vck5000: build/aie.mlir
+vck5000: build/aie.mlir build/vector_max.o # kernel object this Makefile builds
+	cp build/vector_max.o* ./
 	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
-						-I/opt/xaiengine/include \
-						-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
-            -I${ROCM_ROOT}/include \
-						-L/opt/xaiengine/lib \
-            -L/lib/x86_64-linux-gnu/ \
-						test_vck5000.cpp \
-						$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
-            ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
-						-Wl,-R/opt/xaiengine/lib \
-            -Wl,-rpath,${ROCM_ROOT}/lib \
-						-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
-
-run: ${targetname}.exe build/final.xclbin build/insts.txt 
-	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+		-I/opt/xaiengine/include \
+		-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+            	-I${ROCM_ROOT}/include \
+		-L/opt/xaiengine/lib \
+            	-L/lib/x86_64-linux-gnu/ \
+		test_vck5000.cpp \
+		$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+            	${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+		-Wl,-R/opt/xaiengine/lib \
+            	-Wl,-rpath,${ROCM_ROOT}/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
 
-clean:
-	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
+clean: clean_trace
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_max.o* vector_max.cc
diff --git a/programming_examples/basic/vector_max/aie2.py b/programming_examples/basic/vector_max/aie2.py
index 5d094a621d..fdc5deba52 100755
--- a/programming_examples/basic/vector_max/aie2.py
+++ b/programming_examples/basic/vector_max/aie2.py
@@ -17,7 +17,7 @@
 
 
 def my_vector_max():
-    N = 64
+    N = 1024
 
     buffer_depth = 2
 
@@ -35,39 +35,38 @@ def my_vector_max():
 
         @device(dev)
         def device_body():
-            memRef_ty = T.memref(N, T.i32())
+            memRef_I_ty = T.memref(N, T.i32())
+            memRef_O_ty = T.memref(1, T.i32())
 
             # AIE Core Function declarations
 
+            vector_max = external_func("vector_max", inputs=[memRef_I_ty, memRef_O_ty])
+
+            scalar_max = external_func("scalar_max", inputs=[memRef_I_ty, memRef_O_ty])
+
             # Tile declarations
             ShimTile = tile(int(sys.argv[2]), 0)
             ComputeTile2 = tile(int(sys.argv[2]), 2)
 
             # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
-            of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
+            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty)
+            of_out = object_fifo(
+                "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty
+            )
 
             # Set up compute tiles
 
             # Compute tile 2
-            @core(ComputeTile2)
+            @core(ComputeTile2, "vector_max.o")
             def core_body():
-                max_val = memref.alloc(1, T.i32())
-                memref.store(arith.constant(0, T.i32()), max_val, [0])
-                # Effective while(1)
-                for _ in for_(sys.maxsize):
-                    # Number of sub-vector "tile" iterations
-                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+                for _ in for_(0xFFFFFFFF):
                     elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    for i in for_(N):
-                        v0 = memref.load(elem_in, [i])
-                        v1 = memref.load(max_val, [0])
-                        v2 = arith.maxui(v1, v0)
-                        memref.store(v2, max_val, [0])
-                        yield_([])
-
-                    v3 = memref.load(max_val, [0])
-                    memref.store(v3, elem_out, [0])
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+
+                    call(
+                        vector_max,
+                        [elem_in, elem_out],
+                    )
                     of_in.release(ObjectFifoPort.Consume, 1)
                     of_out.release(ObjectFifoPort.Produce, 1)
                     yield_([])
@@ -75,8 +74,8 @@ def core_body():
             # To/from AIE-array data movement
             tensor_ty = T.memref(N, T.i32())
 
-            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
-            def sequence(A, B, C):
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
                 ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
                 ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
                 ipu_sync(column=0, row=0, direction=0, channel=0)
diff --git a/programming_examples/basic/vector_max/run.lit b/programming_examples/basic/vector_max/run.lit
index a429e99221..b29f36cc11 100644
--- a/programming_examples/basic/vector_max/run.lit
+++ b/programming_examples/basic/vector_max/run.lit
@@ -4,6 +4,8 @@
 // REQUIRES: ryzen_ai
 //
 // RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
+// RUN: mkdir build
+// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o
 // RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
 // RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
diff --git a/programming_examples/basic/vector_max/test.cpp b/programming_examples/basic/vector_max/test.cpp
index ffe277e862..bd7438a0f9 100644
--- a/programming_examples/basic/vector_max/test.cpp
+++ b/programming_examples/basic/vector_max/test.cpp
@@ -1,4 +1,5 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+///===- test.cpp -------------------------------------------000---*- C++
+///-*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,9 +9,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <bits/stdc++.h>
 #include <boost/program_options.hpp>
 #include <cstdint>
-#include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -21,79 +22,59 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-constexpr int IN_SIZE = 64;
-constexpr int OUT_SIZE = 1;
+#include "test_utils.h"
 
-namespace po = boost::program_options;
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
 
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
+using INOUT0_DATATYPE = std::int32_t;
+using INOUT1_DATATYPE = std::int32_t;
+#endif
 
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
+namespace po = boost::program_options;
 
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
 int main(int argc, const char *argv[]) {
 
-  // Program arguments parsing
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
   po::options_description desc("Allowed options");
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
+  test_utils::add_default_options(desc);
 
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
 
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 1024; // Input only, 64x uint32_t in this example
+  int INOUT1_VOLUME = 1;    // Not used in this example
 
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
 
-  std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
+  // TODO Remove trace for now?
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
 
-  int verbosity = vm["verbosity"].as<int>();
+  srand(time(NULL));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // Start the XRT test code
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
   // Get a device handle
   unsigned int device_index = 0;
   auto device = xrt::device(device_index);
@@ -103,6 +84,7 @@ int main(int argc, const char *argv[]) {
     std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
   auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
 
+  // Load the kernel
   if (verbosity >= 1)
     std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
   std::string Node = vm["kernel"].as<std::string>();
@@ -110,85 +92,170 @@ int main(int argc, const char *argv[]) {
   // Get the kernel from the xclbin
   auto xkernels = xclbin.get_kernels();
   auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
                                  auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
                                  return name.rfind(Node, 0) == 0;
                                });
   auto kernelName = xkernel.get_name();
 
+  // Register xclbin
   if (verbosity >= 1)
     std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
               << "\n";
-
   device.register_xclbin(xclbin);
 
-  // get a hardware context
+  // Get a hardware context
   if (verbosity >= 1)
     std::cout << "Getting hardware context.\n";
   xrt::hw_context context(device, xclbin.get_uuid());
 
-  // get a kernel handle
+  // Get a kernel handle
   if (verbosity >= 1)
     std::cout << "Getting handle to kernel:" << kernelName << "\n";
   auto kernel = xrt::kernel(context, kernelName);
 
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects.\n";
 
-  int32_t *bufInA = bo_inA.map<int32_t *>();
-  std::vector<uint32_t> srcVecA;
-  for (int i = 0; i < IN_SIZE; i++)
-    srcVecA.push_back(i + 1);
-  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
-
-  bufInA[IN_SIZE / 2] = 654321;
-  bufInA[IN_SIZE - 1] = 100;
-
+  // Initialize instruction buffer
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
 
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::int32_t max = (std::int32_t)-2147483648;
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    std::int32_t next = test_utils::random_int32_t(100000);
+    if (next > max)
+      max = next;
+    bufInOut0[i] = next;
+  }
+  // Initialize Inout buffer 1
+  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+  // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME +
+  // trace_size
+
+  // Sync buffers to update input buffer values
   bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
-  if (verbosity >= 1)
-    std::cout << "Running Kernel.\n";
-  auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
-  run.wait();
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
 
-  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  int errors = 0;
 
-  uint32_t *bufOut = bo_out.map<uint32_t *>();
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
 
-  int errors = 0;
+    if (verbosity >= 1) {
+      std::cout << "Running Kernel.\n";
+    }
 
-  uint32_t max_val = 0;
-  for (uint32_t i = 0; i < IN_SIZE; i++) {
-    if (*(bufInA + i) > max_val) {
-      max_val = *(bufInA + i);
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      if (bufInOut1[0] != max) {
+        errors++;
+        std::cout << "max is " << max << " calc " << bufInOut1[0] << std::endl;
+      }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
     }
-  }
 
-  if (*bufOut != max_val) {
-    std::cout << "[ERROR] Maximum value is " << max_val
-              << " but kernel returned " << *bufOut << "\n";
-    errors++;
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
   }
 
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
   if (!errors) {
     std::cout << "\nPASS!\n\n";
     return 0;
   } else {
-    std::cout << "\nfailed.\n\n";
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
     return 1;
   }
 }
diff --git a/programming_examples/basic/passthrough_hardware/CMakeLists.txt b/programming_examples/basic/vector_max_reduce/CMakeLists.txt
similarity index 96%
rename from programming_examples/basic/passthrough_hardware/CMakeLists.txt
rename to programming_examples/basic/vector_max_reduce/CMakeLists.txt
index 58cacd33af..c64f84842b 100644
--- a/programming_examples/basic/passthrough_hardware/CMakeLists.txt
+++ b/programming_examples/basic/vector_max_reduce/CMakeLists.txt
@@ -27,7 +27,7 @@ endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
 
-SET (ProjectName proj_${TARGET_NAME})
+SET (ProjectName ${TARGET_NAME})
 SET (currentTarget ${TARGET_NAME})
 
 if ( WSL )
@@ -48,6 +48,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_max_reduce/Makefile b/programming_examples/basic/vector_max_reduce/Makefile
new file mode 100755
index 0000000000..981856f203
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/Makefile
@@ -0,0 +1,76 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+ACDC_AIE = $(dir $(shell which aie-opt))/..
+
+targetname = vector_max
+devicename = ipu
+col = 0
+CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
+KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/
+
+all: build/final.xclbin build/insts.txt
+
+build/vector_max.o: ${KERNEL_LIB}/vector_max.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir build/vector_max.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
+
+# Target-specific variable overrides applied when building for VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS}
+vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/
+# Build the design's kernel object (build/vector_max.o) and copy it into this directory before running aiecc.py
+vck5000: build/aie.mlir build/vector_max.o
+	cp build/vector_max.o* ./
+	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+		-I${ROCM_ROOT}/include \
+		-L/opt/xaiengine/lib \
+		-L/lib/x86_64-linux-gnu/ \
+		test_vck5000.cpp \
+		$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+		${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+		-Wl,-R/opt/xaiengine/lib \
+		-Wl,-rpath,${ROCM_ROOT}/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+clean: clean_trace
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_max.o* vector_max.cc
\ No newline at end of file
diff --git a/programming_examples/basic/vector_max_reduce/README.md b/programming_examples/basic/vector_max_reduce/README.md
new file mode 100644
index 0000000000..50dc039a51
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/README.md
@@ -0,0 +1,29 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Vector max</ins>
+
+This reference design can be run on either a RyzenAI IPU or a VCK5000.
+
+A single tile traverses a vector in memory and returns the maximum value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary maximum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` depends on whether the application is targeting IPU or VCK5000.
+
+
+To compile and run the design for IPU:
+```
+make
+make run
+```
+
+To compile and run the design for VCK5000:
+```
+make vck5000
+./test.elf
+```
+
diff --git a/programming_examples/basic/vector_max_reduce/aie2.py b/programming_examples/basic/vector_max_reduce/aie2.py
new file mode 100755
index 0000000000..fdc5deba52
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/aie2.py
@@ -0,0 +1,86 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
+import sys
+
+
+def my_vector_max():  # Builds the vector max design and prints its MLIR module
+    N = 1024  # element count of the i32 input memref (memRef_I_ty)
+
+    buffer_depth = 2  # passed to both object FIFOs below
+
+    with mlir_mod_ctx() as ctx:
+        # Command line: argv[1] is the device name, argv[2] is the column index
+        if len(sys.argv) != 3:
+            raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+
+        if sys.argv[1] == "ipu":
+            dev = AIEDevice.ipu
+        elif sys.argv[1] == "xcvc1902":
+            dev = AIEDevice.xcvc1902
+        else:
+            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+        @device(dev)
+        def device_body():
+            memRef_I_ty = T.memref(N, T.i32())
+            memRef_O_ty = T.memref(1, T.i32())
+
+            # AIE Core Function declarations
+            # (both are declared; only vector_max is called by the core below)
+            vector_max = external_func("vector_max", inputs=[memRef_I_ty, memRef_O_ty])
+
+            scalar_max = external_func("scalar_max", inputs=[memRef_I_ty, memRef_O_ty])
+
+            # Tile declarations
+            ShimTile = tile(int(sys.argv[2]), 0)  # (col, 0)
+            ComputeTile2 = tile(int(sys.argv[2]), 2)  # (col, 2)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty)
+            of_out = object_fifo(
+                "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty
+            )
+
+            # Set up compute tiles
+            # Compute tile 2: each loop iteration acquires one "out" and one "in"
+            # object, calls vector_max on them, then releases both.
+            @core(ComputeTile2, "vector_max.o")
+            def core_body():
+                for _ in for_(0xFFFFFFFF):
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+
+                    call(
+                        vector_max,
+                        [elem_in, elem_out],
+                    )
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+            # NOTE(review): C is typed with N elements although only one is copied
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+                ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                ipu_sync(column=0, row=0, direction=0, channel=0)  # NOTE(review): column is fixed at 0, not argv[2] -- confirm
+
+    print(ctx.module)
+
+
+my_vector_max()
diff --git a/programming_examples/basic/vector_max_reduce/run.lit b/programming_examples/basic/vector_max_reduce/run.lit
new file mode 100644
index 0000000000..b29f36cc11
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/run.lit
@@ -0,0 +1,13 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
+// RUN: mkdir build
+// RUN: cd build && xchesscc_wrapper -c ../vector_max.cc -o vector_max.o
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/programming_examples/basic/vector_max_reduce/run_vck5000.lit b/programming_examples/basic/vector_max_reduce/run_vck5000.lit
new file mode 100644
index 0000000000..d314eea2a4
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/run_vck5000.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: hsa
+//
+// RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir
+// RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib
+// RUN: %run_on_vck5000 ./test.elf
+
diff --git a/programming_examples/basic/vector_max_reduce/test.cpp b/programming_examples/basic/vector_max_reduce/test.cpp
new file mode 100644
index 0000000000..bd7438a0f9
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/test.cpp
@@ -0,0 +1,261 @@
+///===- test.cpp -------------------------------------------000---*- C++
+///-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+
+using INOUT0_DATATYPE = std::int32_t;
+using INOUT1_DATATYPE = std::int32_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Main: run the vector max design on the NPU and check the returned maximum
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  bool do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 1024; // Input vector: 1024 x int32_t
+  int INOUT1_VOLUME = 1;    // Output: the single maximum value
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+
+  // Output buffer = result + trace_size bytes of trace (sizes bo_inout1 below)
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
+
+  srand(time(nullptr));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Load the kernel
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [Node, verbosity](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   if (verbosity >= 1) {
+                                     std::cout << "Name: " << name << std::endl;
+                                   }
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  // No kernel name starts with the requested opcode: stop before using end()
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "Kernel " << Node << " not found in xclbin\n";
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::int32_t max = std::numeric_limits<std::int32_t>::min();
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    std::int32_t next = test_utils::random_int32_t(100000);
+    if (next > max)
+      max = next;
+    bufInOut0[i] = next;
+  }
+  // Inout buffer 1 (the result plus any trace data) is not initialized by the
+  // host; it is synced back from the device and read after every run in the
+  // main loop below.
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+
+    if (iter < static_cast<unsigned>(n_warmup_iterations)) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      if (bufInOut1[0] != max) {
+        errors++;
+        std::cout << "max is " << max << " calc " << bufInOut1[0] << std::endl;
+      }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/vector_max_reduce/test_vck5000.cpp b/programming_examples/basic/vector_max_reduce/test_vck5000.cpp
new file mode 100644
index 0000000000..25de4c7823
--- /dev/null
+++ b/programming_examples/basic/vector_max_reduce/test_vck5000.cpp
@@ -0,0 +1,146 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2020-2022, Xilinx Inc.
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fcntl.h>
+#include <iostream>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+#include <xaiengine.h>
+
+#include "memory_allocator.h"
+#include "test_library.h"
+
+#include "aie_data_movement.cpp"
+#include "aie_inc.cpp"
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+constexpr int DMA_COUNT = 64;
+
+void hsa_check_status(const std::string func_name, hsa_status_t status) {
+  if (status != HSA_STATUS_SUCCESS) {
+    // The runtime owns the message it returns: never allocate or free it here
+    const char *status_string = "unknown HSA status";
+    hsa_status_string(status, &status_string);
+    std::cout << func_name << " failed: " << status_string << std::endl;
+  } else {
+    std::cout << func_name << " success" << std::endl;
+  }
+}
+
+int main(int argc, char *argv[]) {
+  // Host-side check for the VCK5000 build: fill an input vector, hand it to
+  // invoke_data_movement() (presumably provided by the included
+  // aie_data_movement.cpp -- TODO confirm) and compare the value written to
+  // `out` with a maximum computed on the host.
+  std::vector<hsa_queue_t *> queues;
+
+  aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie();
+
+  // This is going to initialize HSA, create a queue
+  // and get an agent
+  int ret = mlir_aie_init_device(xaie);
+
+  if (ret) {
+    std::cout << "[ERROR] Error when calling mlir_aie_init_device)"
+              << std::endl;
+    return -1;
+  }
+
+  // Getting access to all of the HSA agents
+  std::vector<hsa_agent_t> agents = xaie->agents;
+
+  if (agents.empty()) {
+    std::cout << "No agents found. Exiting." << std::endl;
+    return -1;
+  }
+
+  std::cout << "Found " << agents.size() << " agents" << std::endl;
+
+  hsa_queue_t *q = xaie->cmd_queue;
+
+  // Adding to our vector of queues
+  queues.push_back(q);
+  assert(queues.size() > 0 && "No queues were sucesfully created!");
+
+  mlir_aie_configure_cores(xaie);
+  mlir_aie_configure_switchboxes(xaie);
+  mlir_aie_initialize_locks(xaie);
+  mlir_aie_configure_dmas(xaie);
+  mlir_aie_start_cores(xaie);
+
+  // Allocating some device memory
+  ext_mem_model_t buf0, buf1, buf2;
+  uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT);
+  uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT);
+  uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(
+      xaie, buf2, 4 /* For some reason can't do 1 */);
+  if (in_a == nullptr || in_b == nullptr || out == nullptr) {
+    std::cout << "Could not allocate in device memory" << std::endl;
+    return -1;
+  }
+
+  mlir_aie_sync_mem_dev(buf0);
+  mlir_aie_sync_mem_dev(buf1);
+  mlir_aie_sync_mem_dev(buf2);
+
+  out[0] = 0xdeface;
+  for (int i = 0; i < DMA_COUNT; i++) {
+    in_a[i] = i + 1;
+  }
+
+  in_a[DMA_COUNT / 2] = 123456;
+  in_a[DMA_COUNT - 1] = 100;
+
+  // NOTE(review): aie2.py in this directory builds the design with N = 1024,
+  // but only DMA_COUNT elements are allocated here -- confirm they agree.
+
+  // Pass arguments in the order of dma_memcpys in the mlir
+  invoke_data_movement(queues[0], &agents[0], out, in_a);
+
+  int errors = 0;
+
+  uint32_t max_val = 0;
+  for (int i = 0; i < DMA_COUNT; i++) {
+    uint32_t s = in_a[i];
+    if (max_val < s) {
+      max_val = s;
+    }
+  }
+
+  // Compare the value found in out[0] after the data movement with the
+  // host-side maximum computed above.
+
+  if (*out != max_val) {
+    errors++;
+    printf("[ERROR] Maximum value is %u but kernel returned %u\n", max_val,
+           *out);
+  }
+
+  // destroying the queue
+  hsa_queue_destroy(queues[0]);
+
+  // Shutdown AIR and HSA
+  mlir_aie_deinit_libxaie(xaie);
+
+  if (!errors) {
+    printf("PASS!\n");
+    return 0;
+  } else {
+    printf("fail %d/%d.\n", errors, 1);
+    return -1;
+  }
+}
diff --git a/programming_examples/basic/vector_min/CMakeLists.txt b/programming_examples/basic/vector_min/CMakeLists.txt
index d9f511062f..c64f84842b 100644
--- a/programming_examples/basic/vector_min/CMakeLists.txt
+++ b/programming_examples/basic/vector_min/CMakeLists.txt
@@ -48,6 +48,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_min/Makefile b/programming_examples/basic/vector_min/Makefile
index 5dec01e083..a5d96b0149 100755
--- a/programming_examples/basic/vector_min/Makefile
+++ b/programming_examples/basic/vector_min/Makefile
@@ -10,18 +10,26 @@ include ../makefile-common
 
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
-SHELL := /bin/bash
-
-targetname = vectorMin
+targetname = vector_min
 devicename = ipu
 col = 0
+CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
+KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/
 
-all: build/final.xclbin
+all: build/final.xclbin build/insts.txt
 
-build/final.xclbin: build/aie.mlir
+build/vector_min.o: ${KERNEL_LIB}/vector_min.cc
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir build/vector_min.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
@@ -34,30 +42,35 @@ else
 	cp _build/${targetname} $@ 
 endif
 
-build/aie.mlir: aie2.py
-	mkdir -p ${@D}
-	python3 $< ${devicename} ${col} > $@
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
 
 # Changing variables when we target VCK5000
 vck5000: devicename=xcvc1902
 vck5000: col=6
+vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS}
+vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/
 
-vck5000: build/aie.mlir
+vck5000: build/aie.mlir build/vector_min.o
+	cp build/vector_min.o* ./
 	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
-						-I/opt/xaiengine/include \
-						-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
-            -I${ROCM_ROOT}/include \
-						-L/opt/xaiengine/lib \
-            -L/lib/x86_64-linux-gnu/ \
-						test_vck5000.cpp \
-						$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
-            ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
-						-Wl,-R/opt/xaiengine/lib \
-            -Wl,-rpath,${ROCM_ROOT}/lib \
-						-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
-
-run: ${targetname}.exe build/final.xclbin build/insts.txt 
-	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+		-I/opt/xaiengine/include \
+		-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+            	-I${ROCM_ROOT}/include \
+		-L/opt/xaiengine/lib \
+            	-L/lib/x86_64-linux-gnu/ \
+		test_vck5000.cpp \
+		$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+            	${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+		-Wl,-R/opt/xaiengine/lib \
+            	-Wl,-rpath,${ROCM_ROOT}/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
 
-clean:
-	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
+clean: clean_trace
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_min.o* vector_min.cc
diff --git a/programming_examples/basic/vector_min/Makefile.pjr b/programming_examples/basic/vector_min/Makefile.pjr
new file mode 100755
index 0000000000..9b0ffca19e
--- /dev/null
+++ b/programming_examples/basic/vector_min/Makefile.pjr
@@ -0,0 +1,67 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../makefile-common
+
+ACDC_AIE = $(dir $(shell which aie-opt))/..
+
+SHELL := /bin/bash
+
+targetname = vectorMin
+devicename = ipu
+col = 0
+
+all: build/final.xclbin
+
+build/vector_min.o:
+	mkdir -p ${@D}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/generic/vector_min.cc -o ${@F}
+
+build/final.xclbin: build/aie.mlir build/vector_min.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+
+vck5000: build/aie.mlir
+	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+						-I/opt/xaiengine/include \
+						-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+            -I${ROCM_ROOT}/include \
+						-L/opt/xaiengine/lib \
+            -L/lib/x86_64-linux-gnu/ \
+						test_vck5000.cpp \
+						$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+            ${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+						-Wl,-R/opt/xaiengine/lib \
+            -Wl,-rpath,${ROCM_ROOT}/lib \
+						-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
diff --git a/programming_examples/basic/vector_min/README.md b/programming_examples/basic/vector_min/README.md
index 3581b595ee..50dc039a51 100644
--- a/programming_examples/basic/vector_min/README.md
+++ b/programming_examples/basic/vector_min/README.md
@@ -8,11 +8,11 @@
 // 
 //===----------------------------------------------------------------------===//-->
 
-# <ins>Vector min</ins>
+# <ins>Vector min</ins>
 
 This reference design can be run on either a RyzenAI IPU or a VCK5000.
 
-Single tile traverses through a vector in memory and returns the min value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary min value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` is dependent on whether the application is targetting IPU or VCK5000.
+A single tile traverses a vector in memory and returns the minimum value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary minimum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` depends on whether the application is targeting IPU or VCK5000.
 
 
 To compile and run the design for IPU:
diff --git a/programming_examples/basic/vector_min/aie2.py b/programming_examples/basic/vector_min/aie2.py
index f0b931431a..c235b7f278 100755
--- a/programming_examples/basic/vector_min/aie2.py
+++ b/programming_examples/basic/vector_min/aie2.py
@@ -17,7 +17,7 @@
 
 
 def my_vector_max():
-    N = 64
+    N = 1024
 
     buffer_depth = 2
 
@@ -35,39 +35,38 @@ def my_vector_max():
 
         @device(dev)
         def device_body():
-            memRef_ty = T.memref(N, T.i32())
+            memRef_I_ty = T.memref(N, T.i32())
+            memRef_O_ty = T.memref(1, T.i32())
 
             # AIE Core Function declarations
 
+            vector_min = external_func("vector_min", inputs=[memRef_I_ty, memRef_O_ty])
+
+            scalar_min = external_func("scalar_min", inputs=[memRef_I_ty, memRef_O_ty])
+
             # Tile declarations
             ShimTile = tile(int(sys.argv[2]), 0)
             ComputeTile2 = tile(int(sys.argv[2]), 2)
 
             # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
-            of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
+            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty)
+            of_out = object_fifo(
+                "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty
+            )
 
             # Set up compute tiles
 
             # Compute tile 2
-            @core(ComputeTile2)
+            @core(ComputeTile2, "vector_min.o")
             def core_body():
-                min_val = memref.alloc(1, T.i32())
-                memref.store(arith.constant(1000, T.i32()), min_val, [0])
-                # Effective while(1)
-                for _ in for_(sys.maxsize):
-                    # Number of sub-vector "tile" iterations
-                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+                for _ in for_(0xFFFFFFFF):
                     elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    for i in for_(N):
-                        v0 = memref.load(elem_in, [i])
-                        v1 = memref.load(min_val, [0])
-                        v2 = arith.minui(v1, v0)
-                        memref.store(v2, min_val, [0])
-                        yield_([])
-
-                    v3 = memref.load(min_val, [0])
-                    memref.store(v3, elem_out, [0])
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+
+                    call(
+                        vector_min,
+                        [elem_in, elem_out],
+                    )
                     of_in.release(ObjectFifoPort.Consume, 1)
                     of_out.release(ObjectFifoPort.Produce, 1)
                     yield_([])
@@ -75,8 +74,8 @@ def core_body():
             # To/from AIE-array data movement
             tensor_ty = T.memref(N, T.i32())
 
-            @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
-            def sequence(A, B, C):
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
                 ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
                 ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
                 ipu_sync(column=0, row=0, direction=0, channel=0)
diff --git a/programming_examples/basic/vector_min/run.lit b/programming_examples/basic/vector_min/run.lit
index a429e99221..b29f36cc11 100644
--- a/programming_examples/basic/vector_min/run.lit
+++ b/programming_examples/basic/vector_min/run.lit
@@ -4,6 +4,8 @@
 // REQUIRES: ryzen_ai
 //
 // RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
+// RUN: mkdir build
+// RUN: cd build && xchesscc_wrapper -c ../vector_min.cc -o vector_min.o
 // RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
 // RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
diff --git a/programming_examples/basic/vector_min/run_vck5000.lit b/programming_examples/basic/vector_min/run_vck5000.lit
index 83b350c50d..d314eea2a4 100644
--- a/programming_examples/basic/vector_min/run_vck5000.lit
+++ b/programming_examples/basic/vector_min/run_vck5000.lit
@@ -5,5 +5,5 @@
 //
 // RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir
 // RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib
-// RUN: %run_on_vck5000 ./test.elf 
+// RUN: %run_on_vck5000 ./test.elf
 
diff --git a/programming_examples/basic/vector_min/test.cpp b/programming_examples/basic/vector_min/test.cpp
index 61339fc01e..0c596a7068 100644
--- a/programming_examples/basic/vector_min/test.cpp
+++ b/programming_examples/basic/vector_min/test.cpp
@@ -1,4 +1,5 @@
-//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+///===- test.cpp -------------------------------------------000---*- C++
+///-*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,9 +9,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <bits/stdc++.h>
 #include <boost/program_options.hpp>
 #include <cstdint>
-#include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -21,79 +22,59 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-constexpr int IN_SIZE = 64;
-constexpr int OUT_SIZE = 1;
+#include "test_utils.h"
 
-namespace po = boost::program_options;
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
 
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
+using INOUT0_DATATYPE = std::int32_t;
+using INOUT1_DATATYPE = std::int32_t;
+#endif
 
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
+namespace po = boost::program_options;
 
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
 int main(int argc, const char *argv[]) {
 
-  // Program arguments parsing
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
   po::options_description desc("Allowed options");
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
   po::variables_map vm;
+  test_utils::add_default_options(desc);
 
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
 
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 1024; // Input only, 1024x int32_t in this example
+  int INOUT1_VOLUME = 1;    // Output, 1x int32_t holding the kernel's result
 
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
 
-  std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
+  // TODO Remove trace for now?
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
 
-  int verbosity = vm["verbosity"].as<int>();
+  srand(time(NULL));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
   if (verbosity >= 1)
     std::cout << "Sequence instr count: " << instr_v.size() << "\n";
 
-  // Start the XRT test code
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
   // Get a device handle
   unsigned int device_index = 0;
   auto device = xrt::device(device_index);
@@ -103,6 +84,7 @@ int main(int argc, const char *argv[]) {
     std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
   auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
 
+  // Load the kernel
   if (verbosity >= 1)
     std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
   std::string Node = vm["kernel"].as<std::string>();
@@ -110,85 +92,170 @@ int main(int argc, const char *argv[]) {
   // Get the kernel from the xclbin
   auto xkernels = xclbin.get_kernels();
   auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
+                               [Node, verbosity](xrt::xclbin::kernel &k) {
                                  auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
+                                 if (verbosity >= 1) {
+                                   std::cout << "Name: " << name << std::endl;
+                                 }
                                  return name.rfind(Node, 0) == 0;
                                });
   auto kernelName = xkernel.get_name();
 
+  // Register xclbin
   if (verbosity >= 1)
     std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
               << "\n";
-
   device.register_xclbin(xclbin);
 
-  // get a hardware context
+  // Get a hardware context
   if (verbosity >= 1)
     std::cout << "Getting hardware context.\n";
   xrt::hw_context context(device, xclbin.get_uuid());
 
-  // get a kernel handle
+  // Get a kernel handle
   if (verbosity >= 1)
     std::cout << "Getting handle to kernel:" << kernelName << "\n";
   auto kernel = xrt::kernel(context, kernelName);
 
+  // ------------------------------------------------------
+  // Allocate buffers; bo_inout1 = result + trace_size bytes
+  // ------------------------------------------------------
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
-                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects.\n";
 
-  int32_t *bufInA = bo_inA.map<int32_t *>();
-  std::vector<uint32_t> srcVecA;
-  for (int i = 100; i < 100 + IN_SIZE; i++)
-    srcVecA.push_back(i + 1);
-  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
-
-  bufInA[IN_SIZE / 2] = 4;
-  bufInA[IN_SIZE - 1] = 100;
-
+  // Initialize instruction buffer
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
 
+  // Initialize Inout buffer 0
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::int32_t min = (std::int32_t)2147483647;
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    std::int32_t next = test_utils::random_int32_t(100000);
+    if (next < min)
+      min = next;
+    bufInOut0[i] = next;
+  }
+  // Initialize Inout buffer 1
+  // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+  // memset(bufInOut1, 0xdeadbeef, OUT_SIZE); // Zeroes out INOUT2_VOLUME +
+  // trace_size
+
+  // Sync buffers to update input buffer values
   bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
-  if (verbosity >= 1)
-    std::cout << "Running Kernel.\n";
-  auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
-  run.wait();
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
 
-  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  int errors = 0;
 
-  uint32_t *bufOut = bo_out.map<uint32_t *>();
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
 
-  int errors = 0;
+    if (verbosity >= 1) {
+      std::cout << "Running Kernel.\n";
+    }
 
-  uint32_t min_val = 1000000;
-  for (uint32_t i = 0; i < IN_SIZE; i++) {
-    if (*(bufInA + i) < min_val) {
-      min_val = *(bufInA + i);
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+
+    if (iter < n_warmup_iterations) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Copy output results and verify they are correct
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      if (bufInOut1[0] != min) {
+        errors++;
+        std::cout << "min is " << min << " calc " << bufInOut1[0] << std::endl;
+      }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
     }
-  }
 
-  if (*bufOut != min_val) {
-    std::cout << "[ERROR] Min value is " << min_val << " but kernel returned "
-              << *bufOut << "\n";
-    errors++;
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
   }
 
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
   if (!errors) {
     std::cout << "\nPASS!\n\n";
     return 0;
   } else {
-    std::cout << "\nfailed.\n\n";
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
     return 1;
   }
 }
diff --git a/programming_examples/basic/vector_min_reduce/CMakeLists.txt b/programming_examples/basic/vector_min_reduce/CMakeLists.txt
new file mode 100644
index 0000000000..c64f84842b
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/CMakeLists.txt
@@ -0,0 +1,69 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# target_link_directories() below requires CMake >= 3.13
+cmake_minimum_required(VERSION 3.13)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ../../../programming_examples/utils
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/vector_min_reduce/Makefile b/programming_examples/basic/vector_min_reduce/Makefile
new file mode 100755
index 0000000000..ed85afa75f
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/Makefile
@@ -0,0 +1,76 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+ACDC_AIE = $(dir $(shell which aie-opt))/..
+
+targetname = vector_min
+devicename = ipu
+col = 0
+CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
+KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/generic/
+
+all: build/final.xclbin build/insts.txt
+
+build/vector_min.o: ${KERNEL_LIB}/vector_min.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir build/vector_min.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace:
+	../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json
+
+clean_trace:
+	rm -rf tmpTrace trace.txt
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+vck5000: CHESS_FLAGS=${CHESSCCWRAP1_FLAGS}
+vck5000: KERNEL_LIB=${ACDC_AIE}/../aie_kernels/aie1/
+
+vck5000: build/aie.mlir build/vector_min.o
+	cp build/vector_min.o* ./
+	aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/include \
+            	-I${ROCM_ROOT}/include \
+		-L/opt/xaiengine/lib \
+            	-L/lib/x86_64-linux-gnu/ \
+		test_vck5000.cpp \
+		$(ACDC_AIE)/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+            	${ROCM_ROOT}/lib/libhsa-runtime64.so.1.9.0 \
+		-Wl,-R/opt/xaiengine/lib \
+            	-Wl,-rpath,${ROCM_ROOT}/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+clean: clean_trace
+	rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_min.o* vector_min.cc
\ No newline at end of file
diff --git a/programming_examples/basic/vector_min_reduce/README.md b/programming_examples/basic/vector_min_reduce/README.md
new file mode 100644
index 0000000000..50dc039a51
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/README.md
@@ -0,0 +1,29 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Vector min</ins>
+
+This reference design can be run on either a RyzenAI IPU or a VCK5000.
+
+Single tile traverses through a vector in memory and returns the minimum value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary minimum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. The value of `col` is dependent on whether the application is targeting IPU or VCK5000.
+
+
+To compile and run the design for IPU:
+```
+make
+make run
+```
+
+To compile and run the design for VCK5000:
+```
+make vck5000
+./test.elf
+```
+
diff --git a/programming_examples/basic/vector_min_reduce/aie2.py b/programming_examples/basic/vector_min_reduce/aie2.py
new file mode 100755
index 0000000000..c235b7f278
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/aie2.py
@@ -0,0 +1,103 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
+import sys
+
+
+def my_vector_max():
+    """Build the MLIR module for a single-core vector-min design and print it.
+
+    Despite the function name, the compute core calls the external
+    ``vector_min`` kernel linked from ``vector_min.o``.
+
+    Reads two command line arguments: the device name (``ipu`` or
+    ``xcvc1902``) and the column used for the shim and compute tiles.
+
+    Raises:
+        ValueError: if the argument count is not 2 or the device is unknown.
+    """
+    # Number of i32 elements in the input vector.
+    N = 1024
+
+    # Depth passed to both object FIFOs.
+    buffer_depth = 2
+
+    with mlir_mod_ctx() as ctx:
+
+        if len(sys.argv) != 3:
+            raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+
+        if sys.argv[1] == "ipu":
+            dev = AIEDevice.ipu
+        elif sys.argv[1] == "xcvc1902":
+            dev = AIEDevice.xcvc1902
+        else:
+            raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+        @device(dev)
+        def device_body():
+            # Input buffer holds N elements; output buffer holds one element.
+            memRef_I_ty = T.memref(N, T.i32())
+            memRef_O_ty = T.memref(1, T.i32())
+
+            # AIE Core Function declarations
+
+            vector_min = external_func("vector_min", inputs=[memRef_I_ty, memRef_O_ty])
+
+            # Declared here but not called by the core body below.
+            scalar_min = external_func("scalar_min", inputs=[memRef_I_ty, memRef_O_ty])
+
+            # Tile declarations
+            ShimTile = tile(int(sys.argv[2]), 0)
+            ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile2, buffer_depth, memRef_I_ty)
+            of_out = object_fifo(
+                "out", ComputeTile2, ShimTile, buffer_depth, memRef_O_ty
+            )
+
+            # Set up compute tiles
+
+            # Compute tile 2
+            @core(ComputeTile2, "vector_min.o")
+            def core_body():
+                # Very large trip count; presumably an effective while(1).
+                for _ in for_(0xFFFFFFFF):
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+
+                    call(
+                        vector_min,
+                        [elem_in, elem_out],
+                    )
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @FuncOp.from_py_func(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                # The "out" transfer uses C with 1 element; "in" uses A with N.
+                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+                ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                ipu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+my_vector_max()
diff --git a/programming_examples/basic/vector_min_reduce/run.lit b/programming_examples/basic/vector_min_reduce/run.lit
new file mode 100644
index 0000000000..b29f36cc11
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/run.lit
@@ -0,0 +1,13 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
+// RUN: mkdir build
+// RUN: cd build && xchesscc_wrapper -c ../vector_min.cc -o vector_min.o
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/programming_examples/basic/vector_min_reduce/run_vck5000.lit b/programming_examples/basic/vector_min_reduce/run_vck5000.lit
new file mode 100644
index 0000000000..d314eea2a4
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/run_vck5000.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: hsa
+//
+// RUN: %python %S/aie2.py xcvc1902 6 > ./aie.mlir
+// RUN: %python aiecc.py %VitisSysrootFlag% --host-target=%aieHostTargetTriplet% %link_against_hsa% ./aie.mlir -I%host_runtime_lib%/test_lib/include %extraAieCcFlags% %S/test_vck5000.cpp -o test.elf -L%host_runtime_lib%/test_lib/lib -ltest_lib
+// RUN: %run_on_vck5000 ./test.elf
+
diff --git a/programming_examples/basic/vector_min_reduce/test.cpp b/programming_examples/basic/vector_min_reduce/test.cpp
new file mode 100644
index 0000000000..0c596a7068
--- /dev/null
+++ b/programming_examples/basic/vector_min_reduce/test.cpp
@@ -0,0 +1,261 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <bits/stdc++.h>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+
+using INOUT0_DATATYPE = std::int32_t;
+using INOUT1_DATATYPE = std::int32_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Main
+//
+// Fills the input buffer with values from test_utils::random_int32_t, runs the
+// kernel for the requested warmup + timed iterations, and checks that the first
+// word of the output buffer equals the host-side minimum. Returns 0 on PASS.
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  test_utils::add_default_options(desc);
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  // ------------------------------------------------------
+  // Configure this to match your design's buffer size
+  // ------------------------------------------------------
+  int INOUT0_VOLUME = 1024; // Input only, 1024x int32_t in this example
+  int INOUT1_VOLUME = 1;    // Output only, 1x int32_t holding the minimum
+
+  size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+  size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+
+  // The output buffer holds the result followed by optional trace data
+  size_t OUT_SIZE = INOUT1_SIZE + trace_size;
+
+  srand(time(nullptr));
+
+  // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // ------------------------------------------------------
+  // Get device, load the xclbin & kernel and register them
+  // ------------------------------------------------------
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Load the kernel
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin; bail out if no kernel name matches
+  auto xkernels = xclbin.get_kernels();
+  auto matches_name = [Node, verbosity](xrt::xclbin::kernel &k) {
+    auto name = k.get_name();
+    if (verbosity >= 1) {
+      std::cout << "Name: " << name << std::endl;
+    }
+    return name.rfind(Node, 0) == 0;
+  };
+  auto found = std::find_if(xkernels.begin(), xkernels.end(), matches_name);
+  if (found == xkernels.end()) {
+    std::cerr << "Error: no kernel named '" << Node << "' in xclbin\n";
+    return 1;
+  }
+  auto kernelName = found->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inout0 =
+      xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inout1 =
+      xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Initialize instruction buffer
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Initialize Inout buffer 0 and track the expected minimum on the host
+  INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+  std::int32_t min = INT32_MAX;
+  for (int i = 0; i < INOUT0_VOLUME; i++) {
+    std::int32_t next = test_utils::random_int32_t(100000);
+    if (next < min)
+      min = next;
+    bufInOut0[i] = next;
+  }
+  // Inout buffer 1 is not initialized by the host; it is only read back after
+  // each run.
+
+  // Sync buffers to update input buffer values
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Initialize run configs
+  // ------------------------------------------------------
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
+
+  int errors = 0;
+
+  // ------------------------------------------------------
+  // Main run loop
+  // ------------------------------------------------------
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    // Run kernel
+    if (verbosity >= 1)
+      std::cout << "Running Kernel.\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_inout1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+    INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+
+    if (iter < static_cast<unsigned>(n_warmup_iterations)) {
+      /* Warmup iterations do not count towards average runtime. */
+      continue;
+    }
+
+    // Compare the device result against the host-computed minimum
+    if (do_verify) {
+      if (verbosity >= 1) {
+        std::cout << "Verifying results ..." << std::endl;
+      }
+      auto vstart = std::chrono::system_clock::now();
+      if (bufInOut1[0] != min) {
+        errors++;
+        std::cout << "min is " << min << " calc " << bufInOut1[0] << std::endl;
+      }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime = std::chrono::duration<float>(vstop - vstart).count();
+      if (verbosity >= 1) {
+        std::cout << "Verify time: " << vtime << "secs." << std::endl;
+      }
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: results not verified." << std::endl;
+    }
+
+    // Write trace values if trace_size > 0 (trace follows the result word)
+    if (trace_size > 0) {
+      test_utils::write_out_trace(((char *)bufInOut1) + INOUT1_SIZE, trace_size,
+                                  vm["trace_file"].as<std::string>());
+    }
+
+    // Accumulate run times
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  // ------------------------------------------------------
+  // Print verification and timing results
+  // ------------------------------------------------------
+
+  // TODO - Mac count to guide gflops
+  float macs = 0;
+
+  std::cout << std::endl
+            << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+            << std::endl;
+  if (macs > 0)
+    std::cout << "Avg NPU gflops: "
+              << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU time: " << npu_time_min << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+              << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU time: " << npu_time_max << "us." << std::endl;
+  if (macs > 0)
+    std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+              << std::endl;
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nError count: " << errors << "\n\n";
+    std::cout << "\nFailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/vector_scalar/CMakeLists.txt b/programming_examples/basic/vector_scalar/CMakeLists.txt
index d9f511062f..c4ca0825d4 100644
--- a/programming_examples/basic/vector_scalar/CMakeLists.txt
+++ b/programming_examples/basic/vector_scalar/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_scalar/Makefile b/programming_examples/basic/vector_scalar/Makefile
index 4cf3d56bbd..688dcfe2cf 100755
--- a/programming_examples/basic/vector_scalar/Makefile
+++ b/programming_examples/basic/vector_scalar/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
diff --git a/programming_examples/basic/vector_softmax/CMakeLists.txt b/programming_examples/basic/vector_softmax/CMakeLists.txt
index d9f511062f..c4ca0825d4 100644
--- a/programming_examples/basic/vector_softmax/CMakeLists.txt
+++ b/programming_examples/basic/vector_softmax/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_softmax/Makefile b/programming_examples/basic/vector_softmax/Makefile
index e0ba2814ca..eea6b707f5 100755
--- a/programming_examples/basic/vector_softmax/Makefile
+++ b/programming_examples/basic/vector_softmax/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 targetname = testExp
 
@@ -17,15 +17,15 @@ build/vecexp.cc: bf16_softmax.mlir
 	cd ${@D} &&	aie-opt ../$< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o vecexp.cc
 
 build/vecexp.o: build/vecexp.cc
-	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
 
 build/lut_based_ops.o:
 	mkdir -p ${@D}
-	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
 
 build/exp.o: exp.cc
 	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I${REPO_ROOT}/my_install/mlir_aie/aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
 
 build/kernels.a: build/exp.o build/lut_based_ops.o build/vecexp.o
 	ar rvs $@ $+
@@ -42,7 +42,7 @@ build/final.xclbin: build/aie.mlir build/kernels.a
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/basic/vector_sum/CMakeLists.txt b/programming_examples/basic/vector_sum/CMakeLists.txt
index d9f511062f..20452d080e 100644
--- a/programming_examples/basic/vector_sum/CMakeLists.txt
+++ b/programming_examples/basic/vector_sum/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,14 +45,18 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
+set_property(TARGET ${currentTarget} PROPERTY CXX_STANDARD 23)
+
 target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/weight_expand/CMakeLists.txt b/programming_examples/basic/weight_expand/CMakeLists.txt
index d9f511062f..c4ca0825d4 100644
--- a/programming_examples/basic/weight_expand/CMakeLists.txt
+++ b/programming_examples/basic/weight_expand/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,6 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/weight_expand/Makefile b/programming_examples/basic/weight_expand/Makefile
index 9fa9762e11..641b4902b3 100755
--- a/programming_examples/basic/weight_expand/Makefile
+++ b/programming_examples/basic/weight_expand/Makefile
@@ -6,12 +6,12 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../makefile-common
+include ../../makefile-common
 
 targetname = expand
 
 all: build/final.xclbin build/insts.txt ${targetname}.exe
 
 build/%.o: %.cc
 	mkdir -p ${@D}
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
@@ -28,7 +28,7 @@ build/final.xclbin: build/aie.mlir build/expand.o
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/basic/makefile-common b/programming_examples/makefile-common
similarity index 100%
rename from programming_examples/basic/makefile-common
rename to programming_examples/makefile-common
diff --git a/programming_examples/ml/eltwise_add/CMakeLists.txt b/programming_examples/ml/eltwise_add/CMakeLists.txt
index c64f84842b..c4ca0825d4 100644
--- a/programming_examples/ml/eltwise_add/CMakeLists.txt
+++ b/programming_examples/ml/eltwise_add/CMakeLists.txt
@@ -13,9 +13,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
     set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
@@ -40,6 +45,7 @@ project(${ProjectName})
 find_package(Boost REQUIRED)
 
 add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -48,7 +54,7 @@ target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
 target_include_directories (${currentTarget} PUBLIC 
     ${XRT_INC_DIR}
     ${Boost_INCLUDE_DIRS}
-    ../../../programming_examples/utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
 )
 
 target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile
index dd75274321..ebaf16c2f9 100644
--- a/programming_examples/ml/eltwise_add/Makefile
+++ b/programming_examples/ml/eltwise_add/Makefile
@@ -6,7 +6,7 @@
 # 
 ##===----------------------------------------------------------------------===##
 
-include ../../../programming_examples/basic/makefile-common
+include ../../makefile-common
 
 all: build/final.xclbin
 
@@ -14,7 +14,7 @@ targetname = myEltwiseAdd
 
 build/add.o:
 	mkdir -p ${@D}
-	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ${REPO_ROOT}/aie_kernels/aie2/add.cc -o ${@F}
+	cd ${@D} &&	xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/add.cc -o ${@F}
 
 build/aie.mlir: aie2.py
 	mkdir -p ${@D}
@@ -28,8 +28,7 @@ build/final.xclbin: build/aie.mlir build/add.o
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-#	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
-	cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake .. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/utils/makefile-common b/programming_examples/utils/makefile-common
deleted file mode 100644
index bca449a071..0000000000
--- a/programming_examples/utils/makefile-common
+++ /dev/null
@@ -1,22 +0,0 @@
-# Contains common definitions used across the Makefiles of ipu-xrt tests.
-REPO_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/../../..)
-
-# VITIS related variables
-VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../)
-VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools
-VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib
-VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib
-
-CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include
-CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
-CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR}
-
-CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include 
-CHESSCCWRAP2_FLAGS = aie2 -I ${VITIS_AIETOOLS_DIR}/include 
-
-TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no)
-ifeq ($(TEST_POWERSHELL),yes)
-	powershell = powershell.exe
-else
-	powershell = 
-endif
diff --git a/programming_examples/vision/color_detect/CMakeLists.txt b/programming_examples/vision/color_detect/CMakeLists.txt
index c92b647d43..be4d1e64c4 100644
--- a/programming_examples/vision/color_detect/CMakeLists.txt
+++ b/programming_examples/vision/color_detect/CMakeLists.txt
@@ -14,9 +14,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
@@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}")
 
 
 add_executable(${currentTarget}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -63,6 +68,7 @@ target_compile_definitions(${currentTarget} PUBLIC
 
 target_include_directories (${currentTarget} PUBLIC 
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
     ${XRT_INC_DIR}
     ${OpenCV_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIRS}
diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile
index d57db0abe7..3311c71a9d 100755
--- a/programming_examples/vision/color_detect/Makefile
+++ b/programming_examples/vision/color_detect/Makefile
@@ -46,7 +46,8 @@ build/${targetname}.exe: test.cpp
 	mkdir -p ${@D}
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORDETECT_WIDTH=${COLORDETECT_WIDTH} -DCOLORDETECT_HEIGHT=${COLORDETECT_HEIGHT} 
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORDETECT_WIDTH=${COLORDETECT_WIDTH} -DCOLORDETECT_HEIGHT=${COLORDETECT_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13
+
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/vision/color_detect/test.cpp b/programming_examples/vision/color_detect/test.cpp
index 36762c4197..a2eac7ae3f 100644
--- a/programming_examples/vision/color_detect/test.cpp
+++ b/programming_examples/vision/color_detect/test.cpp
@@ -162,8 +162,9 @@ int main(int argc, const char *argv[]) {
   xrt::device device;
   xrt::kernel kernel;
 
-  initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(),
-                    vm["kernel"].as<std::string>());
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   vm["xclbin"].as<std::string>(),
+                                   vm["kernel"].as<std::string>());
 
   /*
    ****************************************************************************
diff --git a/programming_examples/vision/color_threshold/CMakeLists.txt b/programming_examples/vision/color_threshold/CMakeLists.txt
index bf67da2180..d04bc7d451 100644
--- a/programming_examples/vision/color_threshold/CMakeLists.txt
+++ b/programming_examples/vision/color_threshold/CMakeLists.txt
@@ -14,9 +14,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
@@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}")
 
 
 add_executable(${currentTarget}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -63,6 +68,7 @@ target_compile_definitions(${currentTarget} PUBLIC
 
 target_include_directories (${currentTarget} PUBLIC 
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
     ${XRT_INC_DIR}
     ${OpenCV_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIRS}
diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile
index cbd478b3d9..591b93de7f 100644
--- a/programming_examples/vision/color_threshold/Makefile
+++ b/programming_examples/vision/color_threshold/Makefile
@@ -42,7 +42,7 @@ build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.m
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORTHRESHOLD_WIDTH=${COLORTHRESHOLD_WIDTH} -DCOLORTHRESHOLD_HEIGHT=${COLORTHRESHOLD_HEIGHT}
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DCOLORTHRESHOLD_WIDTH=${COLORTHRESHOLD_WIDTH} -DCOLORTHRESHOLD_HEIGHT=${COLORTHRESHOLD_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/vision/color_threshold/test.cpp b/programming_examples/vision/color_threshold/test.cpp
index 66b8177fe7..9759948d2b 100644
--- a/programming_examples/vision/color_threshold/test.cpp
+++ b/programming_examples/vision/color_threshold/test.cpp
@@ -71,19 +71,6 @@ int main(int argc, const char *argv[]) {
     po::store(po::parse_command_line(argc, argv, desc), vm);
     po::notify(vm);
 
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
-
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
-
   /*
    ****************************************************************************
    * Load instruction sequence
@@ -104,8 +91,9 @@ int main(int argc, const char *argv[]) {
   xrt::device device;
   xrt::kernel kernel;
 
-  initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(),
-                    vm["kernel"].as<std::string>());
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   vm["xclbin"].as<std::string>(),
+                                   vm["kernel"].as<std::string>());
 
   /*
    ****************************************************************************
diff --git a/programming_examples/vision/edge_detect/CMakeLists.txt b/programming_examples/vision/edge_detect/CMakeLists.txt
index 6e7bb43b66..36f2a43728 100644
--- a/programming_examples/vision/edge_detect/CMakeLists.txt
+++ b/programming_examples/vision/edge_detect/CMakeLists.txt
@@ -14,9 +14,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
@@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}")
 
 
 add_executable(${currentTarget}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -63,6 +68,7 @@ target_compile_definitions(${currentTarget} PUBLIC
 
 target_include_directories (${currentTarget} PUBLIC 
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
     ${XRT_INC_DIR}
     ${OpenCV_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIRS}
diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile
index a0d50006e1..1524daa510 100755
--- a/programming_examples/vision/edge_detect/Makefile
+++ b/programming_examples/vision/edge_detect/Makefile
@@ -45,7 +45,7 @@ build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WID
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DEDGEDETECT_WIDTH=${EDGEDETECT_WIDTH} -DEDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT} 
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DEDGEDETECT_WIDTH=${EDGEDETECT_WIDTH} -DEDGEDETECT_HEIGHT=${EDGEDETECT_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/vision/edge_detect/test.cpp b/programming_examples/vision/edge_detect/test.cpp
index 57389fffbf..27b68aa3d4 100644
--- a/programming_examples/vision/edge_detect/test.cpp
+++ b/programming_examples/vision/edge_detect/test.cpp
@@ -198,8 +198,9 @@ int main(int argc, const char *argv[]) {
     xrt::device device;
     xrt::kernel kernel;
 
-    initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(),
-                      vm["kernel"].as<std::string>());
+    test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                     vm["xclbin"].as<std::string>(),
+                                     vm["kernel"].as<std::string>());
 
     /*
      ****************************************************************************
diff --git a/programming_examples/vision/passthrough/CMakeLists.txt b/programming_examples/vision/passthrough/CMakeLists.txt
index 8493494697..0adfebde01 100644
--- a/programming_examples/vision/passthrough/CMakeLists.txt
+++ b/programming_examples/vision/passthrough/CMakeLists.txt
@@ -14,9 +14,14 @@
 # cmake needs this line
 cmake_minimum_required(VERSION 3.1)
 
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
 find_program(WSL NAMES powershell.exe)
 
 if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
     set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
     set(OpenCV_DIR /usr/include/opencv4 CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
@@ -50,8 +55,8 @@ message("opencv libs: ${OpenCV_LIBS}")
 
 
 add_executable(${currentTarget}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/OpenCVUtils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/xrtUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/OpenCVUtils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
     test.cpp
 )
 
@@ -62,7 +67,8 @@ target_compile_definitions(${currentTarget} PUBLIC
         )
 
 target_include_directories (${currentTarget} PUBLIC 
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
     ${XRT_INC_DIR}
     ${OpenCV_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIRS}
diff --git a/programming_examples/vision/passthrough/Makefile b/programming_examples/vision/passthrough/Makefile
index 55ccefd696..b9d405f85f 100644
--- a/programming_examples/vision/passthrough/Makefile
+++ b/programming_examples/vision/passthrough/Makefile
@@ -8,7 +8,7 @@
 
 include ../../makefile-common
 
-VPATH := ../vision_kernels
+VPATH := ../../../aie_kernels/aie_generic
 
 PASSTHROUGH_WIDTH = 1920
 PASSTHROUGH_HEIGHT = 1080
@@ -38,7 +38,7 @@ build/final_${PASSTHROUGH_WIDTH}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_W
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_WIDTH=${PASSTHROUGH_WIDTH} -DPASSTHROUGH_HEIGHT=${PASSTHROUGH_HEIGHT} 
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} -DPASSTHROUGH_WIDTH=${PASSTHROUGH_WIDTH} -DPASSTHROUGH_HEIGHT=${PASSTHROUGH_HEIGHT} -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
 	cp _build/${targetname}.exe $@
diff --git a/programming_examples/vision/passthrough/run.lit b/programming_examples/vision/passthrough/run.lit
index 8d17f1386b..a3cff036a8 100644
--- a/programming_examples/vision/passthrough/run.lit
+++ b/programming_examples/vision/passthrough/run.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai, opencv, chess
 // ALLOW_RETRIES: 3
 //
-// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../vision_kernels/passThrough.cc -o passThrough.cc.o
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie_generic/passThrough.cc -o passThrough.cc.o
 // RUN: %python %S/aie2.py 1920 1080 | aie-opt -cse -canonicalize -o ./aie.mlir
 // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -DPASSTHROUGH_WIDTH=1920 -DPASSTHROUGH_HEIGHT=1080 -I%S/../../utils %S/../../utils/xrtUtils.cpp %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags  -lrt -lstdc++ -lboost_program_options -lboost_filesystem
diff --git a/programming_examples/vision/passthrough/test.cpp b/programming_examples/vision/passthrough/test.cpp
index ff4ceb49c2..b5136fdfc1 100644
--- a/programming_examples/vision/passthrough/test.cpp
+++ b/programming_examples/vision/passthrough/test.cpp
@@ -108,8 +108,9 @@ int main(int argc, const char *argv[]) {
   xrt::device device;
   xrt::kernel kernel;
 
-  initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(),
-                    vm["kernel"].as<std::string>());
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   vm["xclbin"].as<std::string>(),
+                                   vm["kernel"].as<std::string>());
 
   // set up the buffer objects
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
diff --git a/programming_examples/utils/test_utils.py b/python/test_utils.py
similarity index 99%
rename from programming_examples/utils/test_utils.py
rename to python/test_utils.py
index 5e187f6330..c4b7086031 100644
--- a/programming_examples/utils/test_utils.py
+++ b/python/test_utils.py
@@ -5,7 +5,6 @@
 
 import argparse
 
-
 # options
 def parse_args(args):
     p = argparse.ArgumentParser()
diff --git a/programming_examples/utils/test_utils.h b/runtime_lib/test_lib/test_utils.cpp
similarity index 53%
rename from programming_examples/utils/test_utils.h
rename to runtime_lib/test_lib/test_utils.cpp
index 8eedcde857..320a60a858 100644
--- a/programming_examples/utils/test_utils.h
+++ b/runtime_lib/test_lib/test_utils.cpp
@@ -1,4 +1,4 @@
-//===- test_utils.h ----------------------------000---*- C++ -*-===//
+//===- test_utils.cpp -------------------------------------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,21 +10,14 @@
 
 // This file contains common helper functions for the generic host code
 
-#ifndef TEST_UTILS_H
-#define TEST_UTILS_H
-
-#include <boost/program_options.hpp>
-#include <cmath>
-
-namespace test_utils {
-
-namespace po = boost::program_options;
+#include "test_utils.h"
 
 // --------------------------------------------------------------------------
 // Command Line Argument Handling
 // --------------------------------------------------------------------------
 
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+void test_utils::check_arg_file_exists(po::variables_map &vm_in,
+                                       std::string name) {
   if (!vm_in.count(name)) {
     throw std::runtime_error("Error: no " + name + " file was provided\n");
   } else {
@@ -37,7 +30,7 @@ void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
   }
 }
 
-void add_default_options(po::options_description &desc) {
+void test_utils::add_default_options(po::options_description &desc) {
   desc.add_options()("help,h", "produce help message")(
       "xclbin,x", po::value<std::string>()->required(),
       "the input xclbin path")(
@@ -56,8 +49,9 @@ void add_default_options(po::options_description &desc) {
       "where to store trace output");
 }
 
-void parse_options(int argc, const char *argv[], po::options_description &desc,
-                   po::variables_map &vm) {
+void test_utils::parse_options(int argc, const char *argv[],
+                               po::options_description &desc,
+                               po::variables_map &vm) {
   try {
     po::store(po::parse_command_line(argc, argv, desc), vm);
     po::notify(vm);
@@ -72,15 +66,19 @@ void parse_options(int argc, const char *argv[], po::options_description &desc,
     std::exit(1);
   }
 
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
+  try {
+    check_arg_file_exists(vm, "xclbin");
+    check_arg_file_exists(vm, "instr");
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+  }
 }
 
 // --------------------------------------------------------------------------
 // AIE Specifics
 // --------------------------------------------------------------------------
 
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+std::vector<uint32_t> test_utils::load_instr_sequence(std::string instr_path) {
   std::ifstream instr_file(instr_path);
   std::string line;
   std::vector<uint32_t> instr_v;
@@ -96,24 +94,63 @@ std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
 }
 
 // --------------------------------------------------------------------------
-// Matrix / Float / Math
+// XRT
 // --------------------------------------------------------------------------
-
-static inline std::int16_t random_int16_t() {
-  return (std::int16_t)rand() % 0x10000;
+// Opens device 0, loads and registers the xclbin at `xclbinFileName`, and sets
+// `kernel` to the first kernel whose name starts with `kernelNameInXclbin`.
+// Logs progress when `verbosity >= 1`; throws std::runtime_error on no match.
+void test_utils::init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel,
+                                      int verbosity, std::string xclbinFileName,
+                                      std::string kernelNameInXclbin) {
+  // Get a device handle
+  unsigned int device_index = 0;
+  device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << xclbinFileName << "\n";
+  auto xclbin = xrt::xclbin(xclbinFileName);
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << kernelNameInXclbin << "\n";
+
+  // Get the kernel from the xclbin (prefix match on the kernel name)
+  auto xkernels = xclbin.get_kernels();
+  auto match = std::find_if(
+      xkernels.begin(), xkernels.end(), [&](xrt::xclbin::kernel &k) {
+        auto name = k.get_name();
+        if (verbosity >= 1)
+          std::cout << "Name: " << name << std::endl;
+        return name.rfind(kernelNameInXclbin, 0) == 0;
+      });
+  if (match == xkernels.end()) // dereferencing end() would be undefined
+    throw std::runtime_error("Error: no kernel starting with '" +
+                             kernelNameInXclbin + "' in " + xclbinFileName);
+  auto kernelName = match->get_name();
+
+  // Register xclbin
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << xclbinFileName << "\n";
+  device.register_xclbin(xclbin);
+
+  // Get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // Get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  kernel = xrt::kernel(context, kernelName);
+}
 
-// static inline std::bfloat16_t random_bfloat16_t() {
-//   // Random numbers should NOT be uniformly between 0 and 1, because that
-//   // would make the matrix product AB always close to 1.
-//   return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
-// }
+// --------------------------------------------------------------------------
+// Matrix / Float / Math
+// --------------------------------------------------------------------------
 
 // nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0
 // Original author: P-Gn
 // Source: https://stackoverflow.com/a/32334103
-bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
-                  float abs_th = FLT_MIN)
+bool test_utils::nearly_equal(float a, float b, float epsilon, float abs_th)
-// those defaults are arbitrary and could be removed
+// defaults (on the declaration in test_utils.h) are arbitrary, could be removed
 {
   assert(std::numeric_limits<float>::epsilon() <= epsilon);
@@ -131,76 +168,15 @@ bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
   return diff < std::max(abs_th, epsilon * norm);
 }
 
-template <typename T>
-void print_matrix(const std::vector<T> matrix, int n_cols,
-                  int n_printable_rows = 10, int n_printable_cols = 10,
-                  std::ostream &ostream = std::cout,
-                  const char col_sep[] = "  ", const char elide_sym[] = " ... ",
-                  int w = -1) {
-  assert(matrix.size() % n_cols == 0);
-
-  auto maxima = std::minmax_element(matrix.begin(), matrix.end());
-  T max_val = std::max(*maxima.first, std::abs(*maxima.second));
-  size_t n_digits = log10(max_val);
-  if (w == -1) {
-    w = n_digits;
-  }
-  int n_rows = matrix.size() / n_cols;
-
-  n_printable_rows = std::min(n_rows, n_printable_rows);
-  n_printable_cols = std::min(n_cols, n_printable_cols);
-
-  const bool elide_rows = n_printable_rows < n_rows;
-  const bool elide_cols = n_printable_cols < n_cols;
-
-  if (elide_rows || elide_cols) {
-    w = std::max((int)w, (int)strlen(elide_sym));
-  }
-
-  w += 3; // for decimal point and two decimal digits
-  ostream << std::fixed << std::setprecision(2);
-
-#define print_row(what)                                                        \
-  for (int col = 0; col < n_printable_cols / 2; col++) {                       \
-    ostream << std::right << std::setw(w) << (what);                           \
-    ostream << std::setw(0) << col_sep;                                        \
-  }                                                                            \
-  if (elide_cols) {                                                            \
-    ostream << std::setw(0) << elide_sym;                                      \
-  }                                                                            \
-  for (int col = n_printable_cols / 2 + 1; col < n_printable_cols; col++) {    \
-    ostream << std::right << std::setw(w) << (what);                           \
-    ostream << std::setw(0) << col_sep;                                        \
-  }
-
-  for (int row = 0; row < n_printable_rows / 2; row++) {
-    print_row(matrix[row * n_rows + col]);
-    ostream << std::endl;
-  }
-  if (elide_rows) {
-    print_row(elide_sym);
-    ostream << std::endl;
-  }
-  for (int row = n_printable_rows / 2 + 1; row < n_printable_rows; row++) {
-    print_row(matrix[row * n_rows + col]);
-    ostream << std::endl;
-  }
-
-#undef print_row
-}
-
 // --------------------------------------------------------------------------
 // Tracing
 // --------------------------------------------------------------------------
-void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path) {
+void test_utils::write_out_trace(char *traceOutPtr, size_t trace_size,
+                                 std::string path) {
   std::ofstream fout(path);
   uint32_t *traceOut = (uint32_t *)traceOutPtr;
   for (int i = 0; i < trace_size / sizeof(traceOut[0]); i++) {
     fout << std::setfill('0') << std::setw(8) << std::hex << (int)traceOut[i];
     fout << std::endl;
   }
-}
-
-} // namespace test_utils
-
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/runtime_lib/test_lib/test_utils.h b/runtime_lib/test_lib/test_utils.h
new file mode 100644
index 0000000000..ae094f0fcf
--- /dev/null
+++ b/runtime_lib/test_lib/test_utils.h
@@ -0,0 +1,119 @@
+//===- test_utils.h ---------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// This file contains common helper functions for the generic host code
+
+#ifndef _TEST_UTILS_H_
+#define _TEST_UTILS_H_
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdfloat>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <cmath>
+
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+namespace po = boost::program_options;
+
+namespace test_utils {
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name);
+
+void add_default_options(po::options_description &desc);
+
+void parse_options(int argc, const char *argv[], po::options_description &desc,
+                   po::variables_map &vm);
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path);
+
+void init_xrt_load_kernel(xrt::device &device, xrt::kernel &kernel,
+                          int verbosity, std::string xclbinFileName,
+                          std::string kernelNameInXclbin);
+// Pseudo-random 16-bit integer: the value of rand() narrowed to std::int16_t.
+static inline std::int16_t random_int16_t() { return (std::int16_t)rand(); }
+// Pseudo-random bfloat16 computed as scale * rand() / RAND_MAX + bias.
+static inline std::bfloat16_t random_bfloat16_t(std::bfloat16_t scale,
+                                                std::bfloat16_t bias) {
+  return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) + bias);
+}
+
+bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
+                  float abs_th = FLT_MIN);
+
+// Prints a row-major matrix with `n_cols` columns to `ostream`, in fixed
+// notation with two decimals. At most `n_printable_rows` rows and
+// `n_printable_cols` columns are shown (leading and trailing ones), with
+// `elide_sym` marking the gap; `w` = -1 derives the cell width from the data.
+template <typename T>
+void print_matrix(const std::vector<T> matrix, int n_cols,
+                  int n_printable_rows = 10, int n_printable_cols = 10,
+                  std::ostream &ostream = std::cout,
+                  const char col_sep[] = "  ", const char elide_sym[] = " ... ",
+                  int w = -1) {
+  assert(n_cols > 0 && matrix.size() % n_cols == 0);
+  if (matrix.empty())
+    return; // nothing to print; minmax_element needs a non-empty range
+  auto maxima = std::minmax_element(matrix.begin(), matrix.end());
+  T max_val = std::max(std::abs(*maxima.first), std::abs(*maxima.second));
+  // log10 only makes sense for values >= 1; anything smaller needs one digit.
+  size_t n_digits = max_val >= 1 ? (size_t)log10(max_val) + 1 : 1;
+  if (w == -1)
+    w = n_digits;
+  int n_rows = matrix.size() / n_cols;
+
+  n_printable_rows = std::min(n_rows, n_printable_rows);
+  n_printable_cols = std::min(n_cols, n_printable_cols);
+  const bool elide_rows = n_printable_rows < n_rows;
+  const bool elide_cols = n_printable_cols < n_cols;
+  if (elide_rows || elide_cols)
+    w = std::max((int)w, (int)strlen(elide_sym));
+  w += 3; // for decimal point and two decimal digits
+  ostream << std::fixed << std::setprecision(2);
+
+  // One output line: leading columns, elision marker, trailing columns.
+  // `what` is evaluated per cell and may use the loop variable `col`.
+#define print_row(what)                                                        \
+  for (int col = 0; col < n_printable_cols / 2; col++) {                       \
+    ostream << std::right << std::setw(w) << (what);                           \
+    ostream << std::setw(0) << col_sep;                                        \
+  }                                                                            \
+  if (elide_cols)                                                              \
+    ostream << std::setw(0) << elide_sym;                                      \
+  for (int col = n_cols - (n_printable_cols + 1) / 2; col < n_cols; col++) {   \
+    ostream << std::right << std::setw(w) << (what);                           \
+    ostream << std::setw(0) << col_sep;                                        \
+  }                                                                            \
+  ostream << std::endl;
+
+  // Row-major layout: consecutive rows are `n_cols` elements apart.
+  for (int row = 0; row < n_printable_rows / 2; row++) {
+    print_row(matrix[row * n_cols + col]);
+  }
+  if (elide_rows) {
+    print_row(elide_sym);
+  }
+  for (int row = n_rows - (n_printable_rows + 1) / 2; row < n_rows; row++) {
+    print_row(matrix[row * n_cols + col]);
+  }
+#undef print_row
+}
+
+void write_out_trace(char *traceOutPtr, size_t trace_size, std::string path);
+
+} // namespace test_utils
+
+#endif // _TEST_UTILS_H_
\ No newline at end of file
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit b/test/ipu-xrt/matrix_multiplication_using_dma/run.lit
index 86b773fb81..ac347dcce6 100644
--- a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit
+++ b/test/ipu-xrt/matrix_multiplication_using_dma/run.lit
@@ -8,4 +8,3 @@
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
 // RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
-`
\ No newline at end of file