
Brodey | unified memory #62

Merged: 34 commits into main from brodey/unified on Dec 19, 2024. Changes from all commits are shown below.

Commits (all by brodeynewman):
350c7d8  chore: bm (Oct 9, 2024)
1ad672a  Merge branch 'main' of github.com:kevmo314/scuda (Oct 9, 2024)
59150ee  chore: merge (Oct 9, 2024)
c7d0b7d  Merge branch 'main' of github.com:kevmo314/scuda (Oct 11, 2024)
29a919e  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
233b8e9  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
fc00189  Merge branch 'main' of github.com:kevmo314/scuda (Oct 17, 2024)
79ccd26  Merge branch 'main' of github.com:kevmo314/scuda (Oct 23, 2024)
38a351c  Merge branch 'main' of github.com:kevmo314/scuda (Oct 29, 2024)
ab2e209  Merge branch 'main' of github.com:kevmo314/scuda (Nov 6, 2024)
ccd7c31  Merge branch 'main' of github.com:kevmo314/scuda (Nov 8, 2024)
25cad41  Merge branch 'main' of github.com:kevmo314/scuda (Nov 9, 2024)
11f8e43  Merge branch 'main' of github.com:kevmo314/scuda (Nov 11, 2024)
8e3d836  Merge branch 'main' of github.com:kevmo314/scuda (Nov 18, 2024)
e20c750  Merge branch 'main' of github.com:kevmo314/scuda (Nov 28, 2024)
8f56379  Merge branch 'main' of github.com:kevmo314/scuda (Dec 2, 2024)
e5592dc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 3, 2024)
aeef059  Merge branch 'main' of github.com:kevmo314/scuda (Dec 7, 2024)
3d29fc8  chore: capture device pointers (Dec 8, 2024)
5b890da  chore: unified working example (Dec 13, 2024)
fdf8853  chore: extern (Dec 13, 2024)
07b0e2a  chore: todo (Dec 13, 2024)
3740492  chore: map original host pointer (Dec 15, 2024)
516909a  chore: cleanup (Dec 15, 2024)
e6ec613  chore: cleanup (Dec 15, 2024)
062360c  chore: build (Dec 16, 2024)
83d1ebc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 16, 2024)
a6683b4  chore: mmap (Dec 18, 2024)
fcc8669  chore: unified pointer ex (Dec 18, 2024)
a90da39  chore: cleanup (Dec 19, 2024)
2d744fd  chore: manual server (Dec 19, 2024)
94a7411  Merge branch 'main' into brodey/unified (Dec 19, 2024)
f687e92  chore: additional cleanup (Dec 19, 2024)
00d6915  chore: unified pointer test (Dec 19, 2024)
111 changes: 111 additions & 0 deletions client.cpp
@@ -18,9 +18,17 @@
#include <cuda.h>
#include <sys/uio.h>
#include <netinet/tcp.h>
#include <cuda_runtime.h>

#include <unordered_map>

#include <setjmp.h>
#include <signal.h>
#include <csignal>
#include <cstdlib>
#include <cstring>
#include <sys/mman.h>

#include "codegen/gen_client.h"

typedef struct
@@ -34,6 +42,8 @@ typedef struct
pthread_cond_t read_cond;
struct iovec write_iov[128];
int write_iov_count = 0;

std::unordered_map<void*, size_t> unified_devices;
} conn_t;

pthread_mutex_t conn_mutex;
@@ -42,8 +52,76 @@ int nconns = 0;

const char *DEFAULT_PORT = "14833";

static int init = 0;
static jmp_buf catch_segfault;
static void* faulting_address = nullptr;

static void segfault(int sig, siginfo_t* info, void* unused) {
[Inline review comment from the repo owner]:
Can you use void* unused to pass the parent segfault handler?

faulting_address = info->si_addr;

std::cout << "Caught segfault at address: " << faulting_address << std::endl;

for (const auto & [ ptr, sz ] : conns[0].unified_devices)
{
if (ptr <= faulting_address && faulting_address < (void *)((char *)ptr + sz))
{
// align the mapping down to the allocation size so it covers the
// faulting address (note: this assumes sz is a power of two).
void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1));

// Allocate memory at the faulting address
void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (allocated == MAP_FAILED) {
perror("Failed to allocate memory at faulting address");
_exit(1);
}

std::cout << "allocated dynamic memory at address: " << allocated << std::endl;

return;
}
}

// fall back to the default SIGSEGV behavior and re-raise
struct sigaction sa;
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;

if (sigaction(SIGSEGV, &sa, nullptr) == -1) {
perror("Failed to reset SIGSEGV handler");
_exit(EXIT_FAILURE);
}

raise(SIGSEGV);
}

static void set_segfault_handlers() {
if (init > 0) {
return;
}

struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = segfault;

if (sigaction(SIGSEGV, &sa, NULL) == -1) {
perror("sigaction");
exit(EXIT_FAILURE);
}

std::cout << "Segfault handler installed." << std::endl;

init = 1;
}

int rpc_open()
{
set_segfault_handlers();

sigsetjmp(catch_segfault, 1);

if (pthread_mutex_lock(&conn_mutex) < 0)
return -1;

@@ -220,6 +298,39 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
{
// record the device pointer and its size so the fault handler and copies can find it
conns[index].unified_devices.insert({ dev_ptr, size });
}

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (const auto & [ ptr, sz ] : conns[index].unified_devices) {
size_t size = sz;

// ptr is the same on both host/device
cudaError_t res = cudaMemcpy(ptr, ptr, size, kind);
if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes" << std::endl;
}
}
}

void maybe_free_unified_mem(const int index, void *ptr)
{
for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) {
size_t size = sz;

if (dev_ptr == ptr) {
munmap(dev_ptr, size);
return;
}
}
}

int rpc_end_response(const int index, void *result)
{
if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
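Regarding the inline review comment above: one way to honor it is to capture whatever handler was installed before scuda's and fall through to it for faults outside the tracked ranges, instead of resetting to SIG_DFL and re-raising unconditionally. A minimal sketch of that approach, meant to live in this same file; prev_sa, install_with_chaining, and chain_to_parent are illustrative names, not code from this PR:

// Sketch only: remember the previously installed SIGSEGV handler so faults
// we don't own can be forwarded to it (e.g. the host app's crash reporter).
static struct sigaction prev_sa;

static void install_with_chaining() {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = segfault;
    // Passing &prev_sa captures the parent handler instead of discarding it.
    if (sigaction(SIGSEGV, &sa, &prev_sa) == -1) {
        perror("sigaction");
        exit(EXIT_FAILURE);
    }
}

// Called from segfault() when the faulting address is not in unified_devices.
static void chain_to_parent(int sig, siginfo_t *info, void *ctx) {
    if ((prev_sa.sa_flags & SA_SIGINFO) && prev_sa.sa_sigaction != nullptr) {
        prev_sa.sa_sigaction(sig, info, ctx);    // parent used SA_SIGINFO
    } else if (prev_sa.sa_handler != SIG_DFL && prev_sa.sa_handler != SIG_IGN) {
        prev_sa.sa_handler(sig);                 // classic one-argument handler
    } else {
        sigaction(SIGSEGV, &prev_sa, nullptr);   // restore default and re-fault
        raise(sig);
    }
}

This keeps the PR's current behavior as the fallback while still letting a pre-existing handler see any fault scuda doesn't claim.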
2 changes: 2 additions & 0 deletions codegen/annotations.h
@@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f
*/
cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
/**
* @disabled
* @param devPtr SEND_RECV
* @param size SEND_ONLY
* @param flags SEND_ONLY
@@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h
*/
cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags);
/**
* @disabled
* @param devPtr SEND_ONLY
*/
cudaError_t cudaFree(void *devPtr);
2 changes: 2 additions & 0 deletions codegen/codegen.py
@@ -69,9 +69,11 @@
# a list of manually implemented cuda/nvml functions.
# these are automatically appended to each file; operation order is maintained as well.
MANUAL_IMPLEMENTATIONS = [
"cudaFree",
"cudaMemcpy",
"cudaMemcpyAsync",
"cudaLaunchKernel",
"cudaMallocManaged",
]

@dataclass
29 changes: 2 additions & 27 deletions codegen/gen_client.cpp
@@ -9112,20 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
rpc_write(0, devPtr, sizeof(void*)) < 0 ||
rpc_write(0, &size, sizeof(size_t)) < 0 ||
rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_read(0, devPtr, sizeof(void*)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
return return_value;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
{
cudaError_t return_value;
@@ -9183,17 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
return return_value;
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
return return_value;
}

cudaError_t cudaFreeHost(void* ptr)
{
cudaError_t return_value;
@@ -22154,12 +22129,10 @@ std::unordered_map<std::string, void *> functionMap = {
{"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags},
{"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize},
{"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters},
{"cudaMallocManaged", (void *)cudaMallocManaged},
{"cudaMalloc", (void *)cudaMalloc},
{"cudaMallocHost", (void *)cudaMallocHost},
{"cudaMallocPitch", (void *)cudaMallocPitch},
{"cudaMallocArray", (void *)cudaMallocArray},
{"cudaFree", (void *)cudaFree},
{"cudaFreeHost", (void *)cudaFreeHost},
{"cudaFreeArray", (void *)cudaFreeArray},
{"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray},
@@ -22852,9 +22825,11 @@ std::unordered_map<std::string, void *> functionMap = {
{"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync},
{"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync},
{"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync},
{"cudaFree", (void *)cudaFree},
{"cudaMemcpy", (void *)cudaMemcpy},
{"cudaMemcpyAsync", (void *)cudaMemcpyAsync},
{"cudaLaunchKernel", (void *)cudaLaunchKernel},
{"cudaMallocManaged", (void *)cudaMallocManaged},
};

void *get_function_pointer(const char *name)
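Note that the manually implemented entries are appended at the end of functionMap, after the generated ones, matching the comment in codegen.py above. For context, this map is resolved through get_function_pointer (declared at the bottom of this file); a common way such a map gets consumed is a dlsym interposer in the preloaded client library. A hedged sketch of that pattern, assuming glibc on x86-64; the dlvsym version tag and the hook itself are illustrative, not necessarily scuda's exact mechanism:

// Sketch: serve intercepted CUDA symbols from functionMap first, and fall
// back to the real dlsym for everything else. Compile with -D_GNU_SOURCE so
// <dlfcn.h> exposes RTLD_NEXT; dlvsym avoids recursing into this hook.
#include <dlfcn.h>

extern void *get_function_pointer(const char *name);

extern "C" void *dlsym(void *handle, const char *symbol)
{
    if (void *fn = get_function_pointer(symbol))
        return fn;
    using dlsym_fn = void *(*)(void *, const char *);
    static dlsym_fn real_dlsym =
        (dlsym_fn)dlvsym(RTLD_NEXT, "dlsym", "GLIBC_2.2.5");
    return real_dlsym(handle, symbol);
}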
53 changes: 0 additions & 53 deletions codegen/gen_server.cpp
@@ -19387,35 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn)
return -1;
}

int handle_cudaMallocManaged(void *conn)
{
void* devPtr;
size_t size;
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaMalloc(void *conn)
{
void* devPtr;
@@ -19534,30 +19505,6 @@ int handle_cudaMallocArray(void *conn)
return -1;
}

int handle_cudaFree(void *conn)
{
void* devPtr;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaFree(devPtr);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaFreeHost(void *conn)
{
void* ptr;
44 changes: 42 additions & 2 deletions codegen/manual_client.cpp
@@ -24,6 +24,9 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
void maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);

#define MAX_FUNCTION_NAME 1024
#define MAX_ARGS 128
@@ -337,6 +340,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
{
cudaError_t return_value;

cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);

// Start the RPC request
int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
if (request_id < 0)
@@ -395,6 +400,8 @@
return cudaErrorDevicesUnavailable;
}

cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);

return return_value;
}

@@ -483,8 +490,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long
if (type_size == 0)
continue;
arg_size = type_size;

std::cout << "arg size: " << arg_size << std::endl;
}
else if (ptx_string[i] == '[')
{
@@ -705,6 +710,8 @@ extern "C"
{
void *return_value;

std::cout << "calling __cudaRegisterVar" << std::endl;

// Start the RPC request
int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
if (request_id < 0)
@@ -792,3 +799,36 @@ }
}
}
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
maybe_free_unified_mem(0, devPtr);

if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;

return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
void* d_mem;

cudaError_t err = cudaMalloc((void**)&d_mem, size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
return err;
}

std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl;

allocate_unified_mem_pointer(0, d_mem, size);

*devPtr = d_mem;

return cudaSuccess;
}
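Putting the pieces together, a minimal client program exercising this path might look like the sketch below (our illustration, modeled on the "chore: unified pointer test" commit; the kernel and sizes are made up, and it assumes the scuda client library is preloaded and compiled with nvcc):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

int main()
{
    const int n = 256;
    float *data = nullptr;

    // Intercepted by the manual cudaMallocManaged above: allocates device
    // memory over RPC and records {ptr, size} in unified_devices.
    if (cudaMallocManaged(&data, n * sizeof(float), cudaMemAttachGlobal) != cudaSuccess)
        return 1;

    // The first host write faults; the client's SIGSEGV handler mmaps
    // anonymous pages over the device address so the loop can proceed.
    for (int i = 0; i < n; ++i)
        data[i] = float(i);

    // The manual cudaLaunchKernel copies unified ranges host-to-device
    // before the launch and device-to-host after it.
    scale<<<(n + 63) / 64, 64>>>(data, n);
    cudaDeviceSynchronize();

    printf("data[1] = %f\n", data[1]); // expected: 2.0

    cudaFree(data); // munmaps the host mapping, then frees over RPC
    return 0;
}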
2 changes: 2 additions & 0 deletions codegen/manual_client.h
@@ -3,6 +3,8 @@
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

cudaError_t cudaFree(void* devPtr);
cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags);
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
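Stepping back from the diffs: the client-side scheme above rests on one mechanism, mapping anonymous pages at a faulted address from inside a SIGSEGV handler so the faulting instruction succeeds when it retries. Below is a self-contained miniature of just that trick (no CUDA, entirely our demo; it pins the page with MAP_FIXED, whereas the handler in client.cpp passes the address only as a hint). Note that mmap is not formally async-signal-safe, though on Linux it is a single syscall, which is what the PR relies on:

#include <csignal>
#include <cstdio>
#include <cstring>
#include <unistd.h>
#include <sys/mman.h>

static void *region;                 // address we promise to back on demand
static const size_t kSize = 4096;

static void on_fault(int, siginfo_t *info, void *)
{
    if (info->si_addr != region) {   // not ours: restore default and re-fault
        signal(SIGSEGV, SIG_DFL);
        return;
    }
    // Back the page with anonymous memory; MAP_FIXED pins it to the address
    // the retried instruction will use.
    if (mmap(region, kSize, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
        _exit(1);
}

int main()
{
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = on_fault;
    sigaction(SIGSEGV, &sa, nullptr);

    // Reserve an address range with no access so the first touch faults.
    region = mmap(nullptr, kSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    ((char *)region)[0] = 42;        // faults, handler maps it, retry succeeds
    printf("read back: %d\n", ((char *)region)[0]);
    return 0;
}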