From 60739321498f9d21e6190f18be8149204585a174 Mon Sep 17 00:00:00 2001 From: Brodey Newman Date: Wed, 18 Dec 2024 23:41:12 -0500 Subject: [PATCH] Brodey | unified memory (#62) --- client.cpp | 111 ++++++++++++++++++++++++++++++++++++++ codegen/annotations.h | 2 + codegen/codegen.py | 2 + codegen/gen_client.cpp | 29 +--------- codegen/gen_server.cpp | 53 ------------------ codegen/manual_client.cpp | 44 ++++++++++++++- codegen/manual_client.h | 2 + codegen/manual_server.cpp | 55 +++++++++++++++++++ codegen/manual_server.h | 2 + deploy/Dockerfile.unified | 37 +++++++++++++ deploy/start.sh | 6 ++- local.sh | 22 +++++++- test/unified.cu | 63 ++++++++++++++++++++++ test/unified_linked.cu | 54 +++++++++++++++++++ test/unified_pointer.cu | 70 ++++++++++++++++++++++++ 15 files changed, 467 insertions(+), 85 deletions(-) create mode 100644 deploy/Dockerfile.unified create mode 100644 test/unified.cu create mode 100644 test/unified_linked.cu create mode 100644 test/unified_pointer.cu diff --git a/client.cpp b/client.cpp index 3d5bff2..14def33 100644 --- a/client.cpp +++ b/client.cpp @@ -18,9 +18,17 @@ #include #include #include +#include #include +#include +#include +#include +#include +#include +#include + #include "codegen/gen_client.h" typedef struct @@ -34,6 +42,8 @@ typedef struct pthread_cond_t read_cond; struct iovec write_iov[128]; int write_iov_count = 0; + + std::unordered_map unified_devices; } conn_t; pthread_mutex_t conn_mutex; @@ -42,8 +52,76 @@ int nconns = 0; const char *DEFAULT_PORT = "14833"; +static int init = 0; +static jmp_buf catch_segfault; +static void* faulting_address = nullptr; + +static void segfault(int sig, siginfo_t* info, void* unused) { + faulting_address = info->si_addr; + + std::cout << "Caught segfault at address: " << faulting_address << std::endl; + + for (const auto & [ ptr, sz ] : conns[0].unified_devices) + { + if (ptr <= faulting_address && faulting_address < (ptr + sz)) + { + // ensure we assign memory as close to the faulting address as possible... + // by masking via the allocated unified memory size. + void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1)); + + // Allocate memory at the faulting address + void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (allocated == MAP_FAILED) { + perror("Failed to allocate memory at faulting address"); + _exit(1); + } + + std::cout << "allocated dynamic memory at address: " << allocated << std::endl; + + return; + } + } + + // raise our original segfault handler + struct sigaction sa; + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + + if (sigaction(SIGSEGV, &sa, nullptr) == -1) { + perror("Failed to reset SIGSEGV handler"); + _exit(EXIT_FAILURE); + } + + raise(SIGSEGV); +} + +static void set_segfault_handlers() { + if (init > 0) { + return; + } + + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = segfault; + + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + std::cout << "Segfault handler installed." << std::endl; + + init = 1; +} + int rpc_open() { + set_segfault_handlers(); + + sigsetjmp(catch_segfault, 1); + if (pthread_mutex_lock(&conn_mutex) < 0) return -1; @@ -220,6 +298,39 @@ int rpc_read(const int index, void *data, size_t size) return n; } +void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size) +{ + // allocate new space for pointer mapping + conns[index].unified_devices.insert({ dev_ptr, size }); +} + +void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) +{ + for (const auto & [ ptr, sz ] : conns[index].unified_devices) { + size_t size = reinterpret_cast(sz); + + // ptr is the same on both host/device + cudaError_t res = cudaMemcpy(ptr, ptr, size, kind); + if (res != cudaSuccess) { + std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl; + } else { + std::cout << "Successfully copied " << size << " bytes" << std::endl; + } + } +} + +void maybe_free_unified_mem(const int index, void *ptr) +{ + for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) { + size_t size = reinterpret_cast(sz); + + if (dev_ptr == ptr) { + munmap(dev_ptr, size); + return; + } + } +} + int rpc_end_response(const int index, void *result) { if (read(conns[index].connfd, result, sizeof(int)) < 0 || diff --git a/codegen/annotations.h b/codegen/annotations.h index ddaf798..42fafff 100644 --- a/codegen/annotations.h +++ b/codegen/annotations.h @@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f */ cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig); /** + * @disabled * @param devPtr SEND_RECV * @param size SEND_ONLY * @param flags SEND_ONLY @@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h */ cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags); /** + * @disabled * @param devPtr SEND_ONLY */ cudaError_t cudaFree(void *devPtr); diff --git a/codegen/codegen.py b/codegen/codegen.py index 65bc104..14d7b84 100644 --- a/codegen/codegen.py +++ b/codegen/codegen.py @@ -69,9 +69,11 @@ # a list of manually implemented cuda/nvml functions. # these are automatically appended to each file; operation order is maintained as well. MANUAL_IMPLEMENTATIONS = [ + "cudaFree", "cudaMemcpy", "cudaMemcpyAsync", "cudaLaunchKernel", + "cudaMallocManaged", ] @dataclass diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index b608b62..03e538b 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -9112,20 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c return return_value; } -cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) -{ - cudaError_t return_value; - if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 || - rpc_write(0, devPtr, sizeof(void*)) < 0 || - rpc_write(0, &size, sizeof(size_t)) < 0 || - rpc_write(0, &flags, sizeof(unsigned int)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_read(0, devPtr, sizeof(void*)) < 0 || - rpc_end_response(0, &return_value) < 0) - return cudaErrorDevicesUnavailable; - return return_value; -} - cudaError_t cudaMalloc(void** devPtr, size_t size) { cudaError_t return_value; @@ -9183,17 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe return return_value; } -cudaError_t cudaFree(void* devPtr) -{ - cudaError_t return_value; - if (rpc_start_request(0, RPC_cudaFree) < 0 || - rpc_write(0, &devPtr, sizeof(void*)) < 0 || - rpc_wait_for_response(0) < 0 || - rpc_end_response(0, &return_value) < 0) - return cudaErrorDevicesUnavailable; - return return_value; -} - cudaError_t cudaFreeHost(void* ptr) { cudaError_t return_value; @@ -22154,12 +22129,10 @@ std::unordered_map functionMap = { {"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags}, {"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize}, {"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters}, - {"cudaMallocManaged", (void *)cudaMallocManaged}, {"cudaMalloc", (void *)cudaMalloc}, {"cudaMallocHost", (void *)cudaMallocHost}, {"cudaMallocPitch", (void *)cudaMallocPitch}, {"cudaMallocArray", (void *)cudaMallocArray}, - {"cudaFree", (void *)cudaFree}, {"cudaFreeHost", (void *)cudaFreeHost}, {"cudaFreeArray", (void *)cudaFreeArray}, {"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray}, @@ -22852,9 +22825,11 @@ std::unordered_map functionMap = { {"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync}, {"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync}, {"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync}, + {"cudaFree", (void *)cudaFree}, {"cudaMemcpy", (void *)cudaMemcpy}, {"cudaMemcpyAsync", (void *)cudaMemcpyAsync}, {"cudaLaunchKernel", (void *)cudaLaunchKernel}, + {"cudaMallocManaged", (void *)cudaMallocManaged}, }; void *get_function_pointer(const char *name) diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp index ab04e00..9a6cb8b 100644 --- a/codegen/gen_server.cpp +++ b/codegen/gen_server.cpp @@ -19387,35 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn) return -1; } -int handle_cudaMallocManaged(void *conn) -{ - void* devPtr; - size_t size; - unsigned int flags; - int request_id; - cudaError_t scuda_intercept_result; - if ( - rpc_read(conn, &devPtr, sizeof(void*)) < 0 || - rpc_read(conn, &size, sizeof(size_t)) < 0 || - rpc_read(conn, &flags, sizeof(unsigned int)) < 0 || - false) - goto ERROR_0; - - request_id = rpc_end_request(conn); - if (request_id < 0) - goto ERROR_0; - scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags); - - if (rpc_start_response(conn, request_id) < 0 || - rpc_write(conn, &devPtr, sizeof(void*)) < 0 || - rpc_end_response(conn, &scuda_intercept_result) < 0) - goto ERROR_0; - - return 0; -ERROR_0: - return -1; -} - int handle_cudaMalloc(void *conn) { void* devPtr; @@ -19534,30 +19505,6 @@ int handle_cudaMallocArray(void *conn) return -1; } -int handle_cudaFree(void *conn) -{ - void* devPtr; - int request_id; - cudaError_t scuda_intercept_result; - if ( - rpc_read(conn, &devPtr, sizeof(void*)) < 0 || - false) - goto ERROR_0; - - request_id = rpc_end_request(conn); - if (request_id < 0) - goto ERROR_0; - scuda_intercept_result = cudaFree(devPtr); - - if (rpc_start_response(conn, request_id) < 0 || - rpc_end_response(conn, &scuda_intercept_result) < 0) - goto ERROR_0; - - return 0; -ERROR_0: - return -1; -} - int handle_cudaFreeHost(void *conn) { void* ptr; diff --git a/codegen/manual_client.cpp b/codegen/manual_client.cpp index 02643cd..5f4aca6 100755 --- a/codegen/manual_client.cpp +++ b/codegen/manual_client.cpp @@ -24,6 +24,9 @@ extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); extern int rpc_close(); +void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind); +void* maybe_free_unified_mem(const int index, void *ptr); +extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size); #define MAX_FUNCTION_NAME 1024 #define MAX_ARGS 128 @@ -337,6 +340,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void { cudaError_t return_value; + cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice); + // Start the RPC request int request_id = rpc_start_request(0, RPC_cudaLaunchKernel); if (request_id < 0) @@ -395,6 +400,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void return cudaErrorDevicesUnavailable; } + cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost); + return return_value; } @@ -483,8 +490,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long if (type_size == 0) continue; arg_size = type_size; - - std::cout << "arg size: " << arg_size << std::endl; } else if (ptx_string[i] == '[') { @@ -705,6 +710,8 @@ extern "C" { void *return_value; + std::cout << "calling __cudaRegisterVar" << std::endl; + // Start the RPC request int request_id = rpc_start_request(0, RPC___cudaRegisterVar); if (request_id < 0) @@ -792,3 +799,36 @@ extern "C" } } } + +cudaError_t cudaFree(void* devPtr) +{ + cudaError_t return_value; + maybe_free_unified_mem(0, devPtr); + + if (rpc_start_request(0, RPC_cudaFree) < 0 || + rpc_write(0, &devPtr, sizeof(void*)) < 0 || + rpc_wait_for_response(0) < 0 || + rpc_end_response(0, &return_value) < 0) + return cudaErrorDevicesUnavailable; + + return return_value; +} + +cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags) +{ + void* d_mem; + + cudaError_t err = cudaMalloc((void**)&d_mem, size); + if (err != cudaSuccess) { + std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl; + return err; + } + + std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl; + + allocate_unified_mem_pointer(0, d_mem, size); + + *devPtr = d_mem; + + return cudaSuccess; +} diff --git a/codegen/manual_client.h b/codegen/manual_client.h index 1666ce3..a10bfcb 100644 --- a/codegen/manual_client.h +++ b/codegen/manual_client.h @@ -3,6 +3,8 @@ #include #include +cudaError_t cudaFree(void* devPtr); +cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags); cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); diff --git a/codegen/manual_server.cpp b/codegen/manual_server.cpp index 3054b57..6322cdb 100755 --- a/codegen/manual_server.cpp +++ b/codegen/manual_server.cpp @@ -240,6 +240,8 @@ int handle_cudaLaunchKernel(void *conn) result = cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); + std::cout << "Launch kern result: " << result << std::endl; + if (rpc_start_response(conn, request_id) < 0 || rpc_end_response(conn, &result) < 0) goto ERROR_1; @@ -570,3 +572,56 @@ int handle___cudaRegisterVar(void *conn) return 0; } + +int handle_cudaFree(void *conn) +{ + void* devPtr; + int request_id; + cudaError_t scuda_intercept_result; + if ( + rpc_read(conn, &devPtr, sizeof(void*)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cudaFree(devPtr); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} + +int handle_cudaMallocManaged(void *conn) +{ + void* devPtr; + size_t size; + unsigned int flags; + int request_id; + cudaError_t scuda_intercept_result; + if ( + rpc_read(conn, &devPtr, sizeof(void*)) < 0 || + rpc_read(conn, &size, sizeof(size_t)) < 0 || + rpc_read(conn, &flags, sizeof(unsigned int)) < 0 || + false) + goto ERROR_0; + + request_id = rpc_end_request(conn); + if (request_id < 0) + goto ERROR_0; + scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags); + + if (rpc_start_response(conn, request_id) < 0 || + rpc_write(conn, &devPtr, sizeof(void*)) < 0 || + rpc_end_response(conn, &scuda_intercept_result) < 0) + goto ERROR_0; + + return 0; +ERROR_0: + return -1; +} diff --git a/codegen/manual_server.h b/codegen/manual_server.h index 11e4f11..4449dea 100644 --- a/codegen/manual_server.h +++ b/codegen/manual_server.h @@ -2,9 +2,11 @@ #include #include +int handle_cudaFree(void *conn); int handle_cudaMemcpy(void *conn); int handle_cudaMemcpyAsync(void *conn); int handle_cudaLaunchKernel(void *conn); +int handle_cudaMallocManaged(void *conn); int handle___cudaRegisterVar(void *conn); int handle___cudaRegisterFunction(void *conn); int handle___cudaRegisterFatBinary(void *conn); diff --git a/deploy/Dockerfile.unified b/deploy/Dockerfile.unified new file mode 100644 index 0000000..16b64b1 --- /dev/null +++ b/deploy/Dockerfile.unified @@ -0,0 +1,37 @@ +FROM ubuntu:24.04 + +RUN apt-get update && apt-get install -y \ + build-essential \ + wget \ + curl \ + python3 \ + python3-pip \ + gnupg \ + software-properties-common && \ + add-apt-repository 'deb http://archive.ubuntu.com/ubuntu jammy main universe' && \ + apt-get update && \ + apt-get install -y libtinfo5 && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + rm cuda-keyring_1.0-1_all.deb && \ + apt-get update + +RUN apt-get install -y cuda-toolkit-12-2 + +ENV PATH=/usr/local/cuda-12.2/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64 + +ENV SCUDA_SERVER=71.183.65.76 +ENV libscuda_path=/usr/local/lib/libscuda.so + +COPY ./libscuda.so /usr/local/lib/libscuda.so +COPY unified.o unified.o +COPY unified_pointer.o unified_pointer.o + +COPY start.sh /start.sh +RUN chmod +x /start.sh +RUN chmod +x /unified.o + +CMD ["/bin/bash", "/start.sh", "unified"] \ No newline at end of file diff --git a/deploy/start.sh b/deploy/start.sh index aa66e39..d24f11e 100644 --- a/deploy/start.sh +++ b/deploy/start.sh @@ -10,6 +10,10 @@ elif [[ "$1" == "cublas" ]]; then echo "Running cublas example..." LD_PRELOAD="$libscuda_path" /matrixMulCUBLAS +elif [[ "$1" == "unified" ]]; then + echo "Running cublas example..." + + LD_PRELOAD="$libscuda_path" /unified_pointer.o else - echo "Unknown option: $1. Please specify 'torch' or 'cublas'." + echo "Unknown option: $1. Please specify one of: torch | cublas | unified ." fi \ No newline at end of file diff --git a/local.sh b/local.sh index caa9c70..1447676 100755 --- a/local.sh +++ b/local.sh @@ -26,8 +26,10 @@ build() { nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o - nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o if [ ! -f "$libscuda_path" ]; then echo "libscuda.so not found. build may have failed." @@ -163,6 +165,17 @@ test_cublas_batched() { fi } +test_unified_mem() { + output=$(LD_PRELOAD="$libscuda_path" ./unified_pointer.o | tail -n 1) + + if [[ "$output" == "Max error: 0" ]]; then + ansi_format "pass" "$pass_message" + else + ansi_format "fail" "vector_add failed. Got [$output]." + return 1 + fi +} + #---- declare test cases ----# declare -A test_cuda_avail=( ["function"]="test_cuda_available" @@ -194,8 +207,13 @@ declare -A test_cublas_batched=( ["pass"]="Batched cublas works via test/cublas_batched.cu." ) +declare -A test_unified_mem=( + ["function"]="test_unified_mem" + ["pass"]="Unified memory works as expected." +) + #---- assign them to our associative array ----# -tests=("test_cuda_avail" "test_tensor_to_cuda" "test_tensor_to_cuda_to_cpu" "test_vector_add" "test_cudnn" "test_cublas_batched") +tests=("test_cuda_avail" "test_tensor_to_cuda" "test_tensor_to_cuda_to_cpu" "test_vector_add" "test_cudnn" "test_cublas_batched" "test_unified_mem") test() { build diff --git a/test/unified.cu b/test/unified.cu new file mode 100644 index 0000000..60adcad --- /dev/null +++ b/test/unified.cu @@ -0,0 +1,63 @@ +#include +#include + +// CUDA Kernel to add elements of two arrays +// __global__ void addKernel(int *a, int *b, int *c, int size) { +// int idx = threadIdx.x + blockIdx.x * blockDim.x; +// if (idx < size) { +// c[idx] = a[idx] * b[idx]; +// } +// } + +__global__ void mulKernel(int *a, int *c, int size) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < size) { + c[idx] = a[idx] * 100; + } +} + +int main() { + + // Define array size + const int size = 10; + const int bytes = size * sizeof(int); + + // Unified memory allocation + int *a, *c; + + cudaMallocManaged(&c, bytes); + cudaMallocManaged(&a, bytes); + + // Initialize arrays on the CPU + for (int i = 0; i < size; ++i) { + a[i] = i; + } + + // Define kernel launch parameters + const int threadsPerBlock = 256; + const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; + + std::cout << "launching kernel..." << std::endl; + + for (int i = 0; i < size; ++i) { + std::cout << "a[" << i << "] + b[" << i << "] = " << a[i] << "\n"; + } + + // Launch the kernel + mulKernel<<>>(a, c, size); + + // Wait for GPU to finish + cudaDeviceSynchronize(); + + // Display results + std::cout << "Results:\n"; + for (int i = 0; i < size; ++i) { + std::cout << "a[" << i << "] + b[" << i << "] = " << c[i] << "\n"; + } + + // Free unified memory + cudaFree(a); + cudaFree(c); + + return 0; +} diff --git a/test/unified_linked.cu b/test/unified_linked.cu new file mode 100644 index 0000000..3bda6f2 --- /dev/null +++ b/test/unified_linked.cu @@ -0,0 +1,54 @@ +#include +#include +// error checking macro +#define cudaCheckErrors(msg) \ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ + msg, cudaGetErrorString(__err), \ + __FILE__, __LINE__); \ + fprintf(stderr, "*** FAILED - ABORTING\n"); \ + exit(1); \ + } \ + } while (0) + +struct list_elem { + int key; + list_elem *next; +}; + +template +void alloc_bytes(T &ptr, size_t num_bytes){ + + cudaMallocManaged(&ptr, num_bytes); +} + +__host__ __device__ +void print_element(list_elem *list, int ele_num){ + list_elem *elem = list; + for (int i = 0; i < ele_num; i++) + elem = elem->next; + printf("key = %d\n", elem->key); +} + +__global__ void gpu_print_element(list_elem *list, int ele_num){ + print_element(list, ele_num); +} + +const int num_elem = 5; +const int ele = 3; +int main(){ + + list_elem *list_base, *list; + alloc_bytes(list_base, sizeof(list_elem)); + list = list_base; + for (int i = 0; i < num_elem; i++){ + list->key = i; + alloc_bytes(list->next, sizeof(list_elem)); + list = list->next;} + print_element(list_base, ele); + gpu_print_element<<<1,1>>>(list_base, ele); + cudaDeviceSynchronize(); + cudaCheckErrors("cuda error!"); +} \ No newline at end of file diff --git a/test/unified_pointer.cu b/test/unified_pointer.cu new file mode 100644 index 0000000..1e39104 --- /dev/null +++ b/test/unified_pointer.cu @@ -0,0 +1,70 @@ +#include +#include + +struct Operation { + float *x; + float *y; + int n; +}; + +// CUDA kernel to add elements of two arrays +__global__ void add(Operation *op) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + printf("The X is: %x\n", op->x[0]); + printf("The Y is: %x\n", op->y[0]); + for (int i = index; i < op->n; i += stride) + { + op->y[i] = op->x[i] + op->y[i]; + } +} + +int main(void) { + Operation *op; + + // Allocate Unified Memory -- accessible from CPU or GPU + cudaMallocManaged(&op, sizeof(Operation)); + op->n = 100; + + cudaMallocManaged(&op->x, op->n * sizeof(float)); + cudaMallocManaged(&op->y, op->n * sizeof(float)); + + // initialize x and y arrays on the host + for (int i = 0; i < op->n; i++) { + op->x[i] = 1.0f; + op->y[i] = 2.0f; + } + + // Launch kernel on n elements on the GPU + int blockSize = 256; + int numBlocks = (op->n + blockSize - 1) / blockSize; + + std::cout << "numBlocks: " << numBlocks << std::endl; + std::cout << "X: " << &op->x << std::endl; + + add<<>>(op); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // Log results for debugging + std::cout << "Results (y = x + y):" << std::endl; + for (int i = 0; i < op->n; i++) { + std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl; + } + + // Check for errors (all values should be 3.0f) + float maxError = 0.0f; + for (int i = 0; i < op->n; i++) { + maxError = fmax(maxError, fabs(op->y[i] - 3.0f)); + } + std::cout << "Max error: " << maxError << std::endl; + + // Free memory + cudaFree(op->x); + cudaFree(op->y); + cudaFree(op); + + return 0; +}