diff --git a/client.cpp b/client.cpp
index 3d5bff2..14def33 100644
--- a/client.cpp
+++ b/client.cpp
@@ -18,9 +18,17 @@
 #include
 #include
 #include
+#include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+
 #include "codegen/gen_client.h"
 
 typedef struct
@@ -34,6 +42,8 @@ typedef struct
     pthread_cond_t read_cond;
     struct iovec write_iov[128];
    int write_iov_count = 0;
+
+    std::unordered_map<void *, size_t> unified_devices;
 } conn_t;
 
 pthread_mutex_t conn_mutex;
@@ -42,8 +52,76 @@ int nconns = 0;
 
 const char *DEFAULT_PORT = "14833";
 
+static int init = 0;
+static jmp_buf catch_segfault;
+static void *faulting_address = nullptr;
+
+static void segfault(int sig, siginfo_t *info, void *unused) {
+    faulting_address = info->si_addr;
+
+    std::cout << "Caught segfault at address: " << faulting_address << std::endl;
+
+    for (const auto &[ptr, sz] : conns[0].unified_devices)
+    {
+        if (ptr <= faulting_address && faulting_address < (void *)((char *)ptr + sz))
+        {
+            // ensure we assign memory as close to the faulting address as possible...
+            // by masking via the allocated unified memory size.
+            void *aligned_address = (void *)((uintptr_t)faulting_address & ~(sz - 1));
+
+            // Allocate memory at the faulting address
+            void *allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+            if (allocated == MAP_FAILED) {
+                perror("Failed to allocate memory at faulting address");
+                _exit(1);
+            }
+
+            std::cout << "allocated dynamic memory at address: " << allocated << std::endl;
+
+            return;
+        }
+    }
+
+    // not one of our unified pointers; restore the default handler and re-raise
+    struct sigaction sa;
+    sa.sa_handler = SIG_DFL;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = 0;
+
+    if (sigaction(SIGSEGV, &sa, nullptr) == -1) {
+        perror("Failed to reset SIGSEGV handler");
+        _exit(EXIT_FAILURE);
+    }
+
+    raise(SIGSEGV);
+}
+
+static void set_segfault_handlers() {
+    if (init > 0) {
+        return;
+    }
+
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_flags = SA_SIGINFO;
+    sa.sa_sigaction = segfault;
+
+    if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+        perror("sigaction");
+        exit(EXIT_FAILURE);
+    }
+
+    std::cout << "Segfault handler installed." << std::endl;
+
+    init = 1;
+}
+
 int rpc_open()
 {
+    set_segfault_handlers();
+
+    sigsetjmp(catch_segfault, 1);
+
     if (pthread_mutex_lock(&conn_mutex) < 0)
         return -1;
 
@@ -220,6 +298,39 @@ int rpc_read(const int index, void *data, size_t size)
     return n;
 }
 
+void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
+{
+    // record the pointer -> size mapping for this connection
+    conns[index].unified_devices.insert({dev_ptr, size});
+}
+
+void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
+{
+    for (const auto &[ptr, sz] : conns[index].unified_devices) {
+        size_t size = sz;
+
+        // ptr is the same on both host/device
+        cudaError_t res = cudaMemcpy(ptr, ptr, size, kind);
+        if (res != cudaSuccess) {
+            std::cerr << "cudaMemcpy failed: " << cudaGetErrorString(res) << std::endl;
+        } else {
+            std::cout << "Successfully copied " << size << " bytes" << std::endl;
+        }
+    }
+}
+
+void maybe_free_unified_mem(const int index, void *ptr)
+{
+    for (const auto &[dev_ptr, sz] : conns[index].unified_devices) {
+        size_t size = sz;
+
+        if (dev_ptr == ptr) {
+            munmap(dev_ptr, size);
+            return;
+        }
+    }
+}
+
 int rpc_end_response(const int index, void *result)
 {
     if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
diff --git a/codegen/annotations.h b/codegen/annotations.h
index ddaf798..42fafff 100644
--- a/codegen/annotations.h
+++ b/codegen/annotations.h
@@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f
  */
 cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
 /**
+ * @disabled
  * @param devPtr SEND_RECV
  * @param size SEND_ONLY
  * @param flags SEND_ONLY
@@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h
  */
 cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags);
 /**
+ * @disabled
  * @param devPtr SEND_ONLY
  */
 cudaError_t cudaFree(void *devPtr);
diff --git a/codegen/codegen.py b/codegen/codegen.py
index 65bc104..14d7b84 100644
--- a/codegen/codegen.py
+++ b/codegen/codegen.py
@@ -69,9 +69,11 @@
 # a list of manually implemented cuda/nvml functions.
 # these are automatically appended to each file; operation order is maintained as well.
 MANUAL_IMPLEMENTATIONS = [
+    "cudaFree",
     "cudaMemcpy",
     "cudaMemcpyAsync",
     "cudaLaunchKernel",
+    "cudaMallocManaged",
 ]
 
 @dataclass
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
index b608b62..03e538b 100644
--- a/codegen/gen_client.cpp
+++ b/codegen/gen_client.cpp
@@ -9112,20 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
     return return_value;
 }
 
-cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
-{
-    cudaError_t return_value;
-    if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
-        rpc_write(0, devPtr, sizeof(void*)) < 0 ||
-        rpc_write(0, &size, sizeof(size_t)) < 0 ||
-        rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
-        rpc_wait_for_response(0) < 0 ||
-        rpc_read(0, devPtr, sizeof(void*)) < 0 ||
-        rpc_end_response(0, &return_value) < 0)
-        return cudaErrorDevicesUnavailable;
-    return return_value;
-}
-
 cudaError_t cudaMalloc(void** devPtr, size_t size)
 {
     cudaError_t return_value;
@@ -9183,17 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
     return return_value;
 }
 
-cudaError_t cudaFree(void* devPtr)
-{
-    cudaError_t return_value;
-    if (rpc_start_request(0, RPC_cudaFree) < 0 ||
-        rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
-        rpc_wait_for_response(0) < 0 ||
-        rpc_end_response(0, &return_value) < 0)
-        return cudaErrorDevicesUnavailable;
-    return return_value;
-}
-
 cudaError_t cudaFreeHost(void* ptr)
 {
     cudaError_t return_value;
@@ -22154,12 +22129,10 @@ std::unordered_map functionMap = {
     {"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags},
     {"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize},
     {"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters},
-    {"cudaMallocManaged", (void *)cudaMallocManaged},
     {"cudaMalloc", (void *)cudaMalloc},
     {"cudaMallocHost", (void *)cudaMallocHost},
     {"cudaMallocPitch", (void *)cudaMallocPitch},
     {"cudaMallocArray", (void *)cudaMallocArray},
-    {"cudaFree", (void *)cudaFree},
     {"cudaFreeHost", (void *)cudaFreeHost},
     {"cudaFreeArray", (void *)cudaFreeArray},
     {"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray},
@@ -22852,9 +22825,11 @@ std::unordered_map functionMap = {
     {"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync},
     {"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync},
     {"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync},
+    {"cudaFree", (void *)cudaFree},
     {"cudaMemcpy", (void *)cudaMemcpy},
     {"cudaMemcpyAsync", (void *)cudaMemcpyAsync},
     {"cudaLaunchKernel", (void *)cudaLaunchKernel},
+    {"cudaMallocManaged", (void *)cudaMallocManaged},
 };
 
 void *get_function_pointer(const char *name)
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
index ab04e00..9a6cb8b 100644
--- a/codegen/gen_server.cpp
+++ b/codegen/gen_server.cpp
@@ -19387,35 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn)
     return -1;
 }
 
-int handle_cudaMallocManaged(void *conn)
-{
-    void* devPtr;
-    size_t size;
-    unsigned int flags;
-    int request_id;
-    cudaError_t scuda_intercept_result;
-    if (
-        rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
-        rpc_read(conn, &size, sizeof(size_t)) < 0 ||
-        rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
-        false)
-        goto ERROR_0;
-
-    request_id = rpc_end_request(conn);
-    if (request_id < 0)
-        goto ERROR_0;
-    scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);
-
-    if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
-        rpc_end_response(conn, &scuda_intercept_result) < 0)
-        goto ERROR_0;
-
-    return 0;
-ERROR_0:
-    return -1;
-}
-
 int handle_cudaMalloc(void *conn)
 {
     void* devPtr;
@@ -19534,30 +19505,6 @@ int handle_cudaMallocArray(void *conn)
     return -1;
 }
 
-int handle_cudaFree(void *conn)
-{
-    void* devPtr;
-    int request_id;
-    cudaError_t scuda_intercept_result;
-    if (
-        rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
-        false)
-        goto ERROR_0;
-
-    request_id = rpc_end_request(conn);
-    if (request_id < 0)
-        goto ERROR_0;
-    scuda_intercept_result = cudaFree(devPtr);
-
-    if (rpc_start_response(conn, request_id) < 0 ||
-        rpc_end_response(conn, &scuda_intercept_result) < 0)
-        goto ERROR_0;
-
-    return 0;
-ERROR_0:
-    return -1;
-}
-
 int handle_cudaFreeHost(void *conn)
 {
     void* ptr;
diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp
index 02643cd..5f4aca6 100755
--- a/codegen/manual_client.cpp
+++ b/codegen/manual_client.cpp
@@ -24,6 +24,9 @@ extern int rpc_wait_for_response(const int index);
 extern int rpc_read(const int index, void *data, const std::size_t size);
 extern int rpc_end_response(const int index, void *return_value);
 extern int rpc_close();
+void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
+void maybe_free_unified_mem(const int index, void *ptr);
+extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);
 
 #define MAX_FUNCTION_NAME 1024
 #define MAX_ARGS 128
@@ -337,6 +340,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
 {
     cudaError_t return_value;
 
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
+
     // Start the RPC request
     int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
     if (request_id < 0)
@@ -395,6 +400,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
         return cudaErrorDevicesUnavailable;
     }
 
+    cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
+
     return return_value;
 }
 
@@ -483,8 +490,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long
         if (type_size == 0)
             continue;
         arg_size = type_size;
-
-        std::cout << "arg size: " << arg_size << std::endl;
     }
     else if (ptx_string[i] == '[')
     {
@@ -705,6 +710,8 @@ extern "C"
 {
     void *return_value;
 
+    std::cout << "calling __cudaRegisterVar" << std::endl;
+
     // Start the RPC request
     int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
     if (request_id < 0)
@@ -792,3 +799,36 @@
       }
     }
 }
+
+cudaError_t cudaFree(void* devPtr)
+{
+    cudaError_t return_value;
+    maybe_free_unified_mem(0, devPtr);
+
+    if (rpc_start_request(0, RPC_cudaFree) < 0 ||
+        rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
+        rpc_wait_for_response(0) < 0 ||
+        rpc_end_response(0, &return_value) < 0)
+        return cudaErrorDevicesUnavailable;
+
+    return return_value;
+}
+
+cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
+{
+    void* d_mem;
+
+    cudaError_t err = cudaMalloc((void**)&d_mem, size);
+    if (err != cudaSuccess) {
+        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
+        return err;
+    }
+
+    std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl;
+
+    allocate_unified_mem_pointer(0, d_mem, size);
+
+    *devPtr = d_mem;
+
+    return cudaSuccess;
+}
diff --git a/codegen/manual_client.h b/codegen/manual_client.h
index 1666ce3..a10bfcb 100644
--- a/codegen/manual_client.h
+++ b/codegen/manual_client.h
@@ -3,6 +3,8 @@
 #include
 #include
 
+cudaError_t cudaFree(void* devPtr);
+cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags);
 cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
 cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
 cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
diff --git a/codegen/manual_server.cpp b/codegen/manual_server.cpp
index 3054b57..6322cdb 100755
--- a/codegen/manual_server.cpp
+++ b/codegen/manual_server.cpp
@@ -240,6 +240,8 @@ int handle_cudaLaunchKernel(void *conn)
 
     result = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
 
+    std::cout << "Launch kern result: " << result << std::endl;
+
     if (rpc_start_response(conn, request_id) < 0 ||
         rpc_end_response(conn, &result) < 0)
         goto ERROR_1;
@@ -570,3 +572,56 @@ int handle___cudaRegisterVar(void *conn)
 
     return 0;
 }
+
+int handle_cudaFree(void *conn)
+{
+    void* devPtr;
+    int request_id;
+    cudaError_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cudaFree(devPtr);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
+
+int handle_cudaMallocManaged(void *conn)
+{
+    void* devPtr;
+    size_t size;
+    unsigned int flags;
+    int request_id;
+    cudaError_t scuda_intercept_result;
+    if (
+        rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
+        rpc_read(conn, &size, sizeof(size_t)) < 0 ||
+        rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
+        false)
+        goto ERROR_0;
+
+    request_id = rpc_end_request(conn);
+    if (request_id < 0)
+        goto ERROR_0;
+    scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);
+
+    if (rpc_start_response(conn, request_id) < 0 ||
+        rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
+        rpc_end_response(conn, &scuda_intercept_result) < 0)
+        goto ERROR_0;
+
+    return 0;
+ERROR_0:
+    return -1;
+}
diff --git a/codegen/manual_server.h b/codegen/manual_server.h
index 11e4f11..4449dea 100644
--- a/codegen/manual_server.h
+++ b/codegen/manual_server.h
@@ -2,9 +2,11 @@
 #include
 #include
 
+int handle_cudaFree(void *conn);
 int handle_cudaMemcpy(void *conn);
 int handle_cudaMemcpyAsync(void *conn);
 int handle_cudaLaunchKernel(void *conn);
+int handle_cudaMallocManaged(void *conn);
 int handle___cudaRegisterVar(void *conn);
 int handle___cudaRegisterFunction(void *conn);
 int handle___cudaRegisterFatBinary(void *conn);
diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified
new file mode 100644
index 0000000..16b64b1
--- /dev/null
+++ b/deploy/Dockerfile.unified
@@ -0,0 +1,37 @@
+FROM ubuntu:24.04
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    wget \
+    curl \
+    python3 \
+    python3-pip \
+    gnupg \
+    software-properties-common && \
+    add-apt-repository 'deb http://archive.ubuntu.com/ubuntu jammy main universe' && \
+    apt-get update && \
+    apt-get install -y libtinfo5 && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    rm cuda-keyring_1.0-1_all.deb && \
+    apt-get update
+
+RUN apt-get install -y cuda-toolkit-12-2
+
+ENV PATH=/usr/local/cuda-12.2/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64
+
+ENV SCUDA_SERVER=71.183.65.76
+ENV libscuda_path=/usr/local/lib/libscuda.so
+
+COPY ./libscuda.so /usr/local/lib/libscuda.so
+COPY unified.o unified.o
+COPY unified_pointer.o unified_pointer.o
+
+COPY start.sh /start.sh
+RUN chmod +x /start.sh
+RUN chmod +x /unified.o
+
+CMD ["/bin/bash", "/start.sh", "unified"]
\ No newline at end of file
diff --git a/deploy/start.sh b/deploy/start.sh
index aa66e39..d24f11e 100644
--- a/deploy/start.sh
+++ b/deploy/start.sh
@@ -10,6 +10,10 @@ elif [[ "$1" == "cublas" ]]; then
     echo "Running cublas example..."
 
     LD_PRELOAD="$libscuda_path" /matrixMulCUBLAS
+elif [[ "$1" == "unified" ]]; then
+    echo "Running unified memory example..."
+
+    LD_PRELOAD="$libscuda_path" /unified_pointer.o
 else
-    echo "Unknown option: $1. Please specify 'torch' or 'cublas'."
+    echo "Unknown option: $1. Please specify one of: torch | cublas | unified."
 fi
\ No newline at end of file
diff --git a/local.sh b/local.sh
index caa9c70..1447676 100755
--- a/local.sh
+++ b/local.sh
@@ -26,8 +26,10 @@ build() {
     nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
     nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
-
     nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
+    nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
+    nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
+    nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
 
     if [ ! -f "$libscuda_path" ]; then
         echo "libscuda.so not found. build may have failed."
@@ -163,6 +165,17 @@ test_cublas_batched() {
     fi
 }
 
+test_unified_mem() {
+    output=$(LD_PRELOAD="$libscuda_path" ./unified_pointer.o | tail -n 1)
+
+    if [[ "$output" == "Max error: 0" ]]; then
+        ansi_format "pass" "$pass_message"
+    else
+        ansi_format "fail" "unified memory test failed. Got [$output]."
+        return 1
+    fi
+}
+
 #---- declare test cases ----#
 declare -A test_cuda_avail=(
     ["function"]="test_cuda_available"
@@ -194,8 +207,13 @@ declare -A test_cublas_batched=(
     ["pass"]="Batched cublas works via test/cublas_batched.cu."
 )
 
+declare -A test_unified_mem=(
+    ["function"]="test_unified_mem"
+    ["pass"]="Unified memory works as expected."
+)
+
 #---- assign them to our associative array ----#
-tests=("test_cuda_avail" "test_tensor_to_cuda" "test_tensor_to_cuda_to_cpu" "test_vector_add" "test_cudnn" "test_cublas_batched")
+tests=("test_cuda_avail" "test_tensor_to_cuda" "test_tensor_to_cuda_to_cpu" "test_vector_add" "test_cudnn" "test_cublas_batched" "test_unified_mem")
 
 test() {
     build
diff --git a/test/unified.cu b/test/unified.cu
new file mode 100644
index 0000000..60adcad
--- /dev/null
+++ b/test/unified.cu
@@ -0,0 +1,63 @@
+#include
+#include
+
+// CUDA Kernel to add elements of two arrays
+// __global__ void addKernel(int *a, int *b, int *c, int size) {
+//     int idx = threadIdx.x + blockIdx.x * blockDim.x;
+//     if (idx < size) {
+//         c[idx] = a[idx] * b[idx];
+//     }
+// }
+
+__global__ void mulKernel(int *a, int *c, int size) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < size) {
+        c[idx] = a[idx] * 100;
+    }
+}
+
+int main() {
+
+    // Define array size
+    const int size = 10;
+    const int bytes = size * sizeof(int);
+
+    // Unified memory allocation
+    int *a, *c;
+
+    cudaMallocManaged(&c, bytes);
+    cudaMallocManaged(&a, bytes);
+
+    // Initialize arrays on the CPU
+    for (int i = 0; i < size; ++i) {
+        a[i] = i;
+    }
+
+    // Define kernel launch parameters
+    const int threadsPerBlock = 256;
+    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+    std::cout << "launching kernel..." << std::endl;
+
+    for (int i = 0; i < size; ++i) {
+        std::cout << "a[" << i << "] = " << a[i] << "\n";
+    }
+
+    // Launch the kernel
+    mulKernel<<<blocks, threadsPerBlock>>>(a, c, size);
+
+    // Wait for GPU to finish
+    cudaDeviceSynchronize();
+
+    // Display results
+    std::cout << "Results:\n";
+    for (int i = 0; i < size; ++i) {
+        std::cout << "c[" << i << "] = " << c[i] << "\n";
+    }
+
+    // Free unified memory
+    cudaFree(a);
+    cudaFree(c);
+
+    return 0;
+}
diff --git a/test/unified_linked.cu b/test/unified_linked.cu
new file mode 100644
index 0000000..3bda6f2
--- /dev/null
+++ b/test/unified_linked.cu
@@ -0,0 +1,54 @@
+#include
+#include
+// error checking macro
+#define cudaCheckErrors(msg) \
+    do { \
+        cudaError_t __err = cudaGetLastError(); \
+        if (__err != cudaSuccess) { \
+            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
+                    msg, cudaGetErrorString(__err), \
+                    __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while (0)
+
+struct list_elem {
+    int key;
+    list_elem *next;
+};
+
+template <typename T>
+void alloc_bytes(T &ptr, size_t num_bytes){
+
+    cudaMallocManaged(&ptr, num_bytes);
+}
+
+__host__ __device__
+void print_element(list_elem *list, int ele_num){
+    list_elem *elem = list;
+    for (int i = 0; i < ele_num; i++)
+        elem = elem->next;
+    printf("key = %d\n", elem->key);
+}
+
+__global__ void gpu_print_element(list_elem *list, int ele_num){
+    print_element(list, ele_num);
+}
+
+const int num_elem = 5;
+const int ele = 3;
+int main(){
+
+    list_elem *list_base, *list;
+    alloc_bytes(list_base, sizeof(list_elem));
+    list = list_base;
+    for (int i = 0; i < num_elem; i++){
+        list->key = i;
+        alloc_bytes(list->next, sizeof(list_elem));
+        list = list->next;}
+    print_element(list_base, ele);
+    gpu_print_element<<<1,1>>>(list_base, ele);
+    cudaDeviceSynchronize();
+    cudaCheckErrors("cuda error!");
+}
\ No newline at end of file
diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu
new file mode 100644
index 0000000..1e39104
--- /dev/null
+++ b/test/unified_pointer.cu
@@ -0,0 +1,70 @@
+#include
+#include
+
+struct Operation {
+    float *x;
+    float *y;
+    int n;
+};
+
+// CUDA kernel to add elements of two arrays
+__global__ void add(Operation *op) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    printf("The X is: %f\n", op->x[0]);
+    printf("The Y is: %f\n", op->y[0]);
+    for (int i = index; i < op->n; i += stride)
+    {
+        op->y[i] = op->x[i] + op->y[i];
+    }
+}
+
+int main(void) {
+    Operation *op;
+
+    // Allocate Unified Memory -- accessible from CPU or GPU
+    cudaMallocManaged(&op, sizeof(Operation));
+    op->n = 100;
+
+    cudaMallocManaged(&op->x, op->n * sizeof(float));
+    cudaMallocManaged(&op->y, op->n * sizeof(float));
+
+    // initialize x and y arrays on the host
+    for (int i = 0; i < op->n; i++) {
+        op->x[i] = 1.0f;
+        op->y[i] = 2.0f;
+    }
+
+    // Launch kernel on n elements on the GPU
+    int blockSize = 256;
+    int numBlocks = (op->n + blockSize - 1) / blockSize;
+
+    std::cout << "numBlocks: " << numBlocks << std::endl;
+    std::cout << "X: " << &op->x << std::endl;
+
+    add<<<numBlocks, blockSize>>>(op);
+
+    // Wait for GPU to finish before accessing on host
+    cudaDeviceSynchronize();
+
+    // Log results for debugging
+    std::cout << "Results (y = x + y):" << std::endl;
+    for (int i = 0; i < op->n; i++) {
+        std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
+    }
+
+    // Check for errors (all values should be 3.0f)
+    float maxError = 0.0f;
+    for (int i = 0; i < op->n; i++) {
+        maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
+    }
+    std::cout << "Max error: " << maxError << std::endl;
+
+    // Free memory
+    cudaFree(op->x);
+    cudaFree(op->y);
+    cudaFree(op);
+
+    return 0;
+}