Merge branch 'main' of github.com:kevmo314/scuda
brodeynewman committed Dec 20, 2024
2 parents 5fb85af + d349b1b commit 644f22a
Showing 7 changed files with 168 additions and 88 deletions.
22 changes: 5 additions & 17 deletions README.md
@@ -3,30 +3,18 @@
SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
to CPU-only machines.

## Demos
## Demo

### CUBLAS Matrix Multiplication
### CUBLAS Matrix Multiplication using Unified Memory

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.
The docker container runs this [cublas_unified](./deploy/cublas_unified.o) example, which not only uses cuBLAS but also takes advantage of unified memory.

You can view the docker image used [here](./deploy/Dockerfile.cublas-test).

https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499

### Simple torch example

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check whether CUDA is available.

You can view the docker image used [here](./deploy/Dockerfile.torch-test).

https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec
You can view the docker image used [here](./deploy/Dockerfile.unified).

https://github.com/user-attachments/assets/b2db5d82-f214-41cf-8274-b913c04080f9
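
To give a feel for what the demo exercises, here is a minimal illustrative sketch of the pattern it relies on: a cuBLAS GEMM running directly on unified (managed) memory, with no explicit `cudaMemcpy` calls. This is a hedged sketch, not the shipped `cublas_unified` example.

```cpp
// Minimal sketch (not the shipped example): cuBLAS SGEMM on unified memory.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int n = 4;
    float *a, *b, *c;
    // Unified memory: the same pointers are valid on host and device.
    cudaMallocManaged(&a, n * n * sizeof(float));
    cudaMallocManaged(&b, n * n * sizeof(float));
    cudaMallocManaged(&c, n * n * sizeof(float));
    for (int i = 0; i < n * n; i++) { a[i] = 1.0f; b[i] = 2.0f; c[i] = 0.0f; }

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // C = alpha * A * B + beta * C (column-major layout).
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                &alpha, a, n, b, n, &beta, c, n);
    cudaDeviceSynchronize();  // make the result visible on the host

    printf("c[0] = %f\n", c[0]);  // 8.0: dot of a row of ones with a column of twos
    cublasDestroy(handle);
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}
```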

## Local development

3 changes: 3 additions & 0 deletions codegen/gen_client.cpp
@@ -19,6 +19,7 @@ extern int rpc_end_request(const int index);
extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
extern int rpc_close();

nvmlReturn_t nvmlInit_v2()
@@ -18581,6 +18582,7 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr

cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount)
{
cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
cublasStatus_t return_value;
if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
rpc_write(0, &batchCount, sizeof(int)) < 0 ||
@@ -18603,6 +18605,7 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
return return_value;
}
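
The generated client now brackets the cuBLAS call with `cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice)` before the RPC and `cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost)` after it, so unified-memory buffers are pushed to the remote GPU before the GEMM and pulled back once it completes. The helper's implementation is not part of this diff; the following is a hedged sketch of one plausible shape, assuming the client keeps a registry of managed allocations (the names `unified_alloc` and `g_unified_allocs` are invented for illustration):

```cpp
// Hypothetical sketch only -- cuda_memcpy_unified_ptrs is declared but not
// implemented in this diff. Assumption: the intercepted cudaMallocManaged
// records each allocation, pairing the server-side pointer with a client-side
// shadow buffer, and every remote call syncs the pair in both directions via
// the intercepted, RPC-backed cudaMemcpy.
#include <cuda_runtime.h>
#include <vector>

struct unified_alloc { void *server_ptr; void *shadow; size_t size; };
static std::vector<unified_alloc> g_unified_allocs;  // filled by the cudaMallocManaged hook

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) {
    (void)index;  // connection index, unused in this sketch
    for (auto &a : g_unified_allocs) {
        if (kind == cudaMemcpyHostToDevice)
            // push the client's view of the buffer to the remote GPU
            cudaMemcpy(a.server_ptr, a.shadow, a.size, cudaMemcpyHostToDevice);
        else
            // pull the remote GPU's view back into the client's buffer
            cudaMemcpy(a.shadow, a.server_ptr, a.size, cudaMemcpyDeviceToHost);
    }
}
```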

90 changes: 27 additions & 63 deletions codegen/manual_server.cpp
@@ -34,16 +34,15 @@ int handle_cudaMemcpy(void *conn)
enum cudaMemcpyKind kind;
int ret = -1;

if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0)
if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
(kind != cudaMemcpyHostToDevice && rpc_read(conn, &src, sizeof(void *)) < 0) ||
(kind != cudaMemcpyDeviceToHost && rpc_read(conn, &dst, sizeof(void *)) < 0) ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

switch (kind)
{
case cudaMemcpyDeviceToHost:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;
@@ -53,16 +52,8 @@ int handle_cudaMemcpy(void *conn)
goto ERROR_1;

result = cudaMemcpy(host_data, src, count, kind);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, host_data, count) < 0)
goto ERROR_1;
break;
case cudaMemcpyHostToDevice:
if (rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;
@@ -75,33 +66,25 @@ int handle_cudaMemcpy(void *conn)
goto ERROR_1;

result = cudaMemcpy(dst, host_data, count, kind);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_1;
break;
case cudaMemcpyDeviceToDevice:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;

result = cudaMemcpy(dst, src, count, kind);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_0;
break;
}

if (rpc_end_response(conn, &result) < 0)
if (rpc_start_response(conn, request_id) < 0 ||
(kind == cudaMemcpyDeviceToHost && rpc_write(conn, host_data, count) < 0) ||
rpc_end_response(conn, &result) < 0)
goto ERROR_1;

ret = 0;
ERROR_1:
free((void *)host_data);
if (host_data != NULL)
free((void *)host_data);
ERROR_0:
return ret;
}
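
The rewrite hoists the direction-dependent reads out of the switch: C's short-circuit evaluation makes the `src`/`dst` reads conditional on `kind` while keeping a single error check, and the response path collapses to one block that streams the payload back only for device-to-host copies. Any client must mirror this wire order exactly. Here is a hedged sketch of the matching client-side call (generated elsewhere, not shown in this diff; `RPC_cudaMemcpy` and the failure code are assumed names):

```cpp
// Hypothetical client-side counterpart -- not part of this diff. The write
// order mirrors the server's reads: kind, then src and/or dst depending on
// direction, then count, then the payload for host-to-device copies.
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaMemcpy) < 0 ||  // assumed opcode name
        rpc_write(0, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
        (kind != cudaMemcpyHostToDevice && rpc_write(0, &src, sizeof(void *)) < 0) ||
        (kind != cudaMemcpyDeviceToHost && rpc_write(0, &dst, sizeof(void *)) < 0) ||
        rpc_write(0, &count, sizeof(size_t)) < 0 ||
        (kind == cudaMemcpyHostToDevice && rpc_write(0, src, count) < 0) ||
        rpc_wait_for_response(0) < 0 ||
        (kind == cudaMemcpyDeviceToHost && rpc_read(0, dst, count) < 0) ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorUnknown;  // assumed failure code
    return return_value;
}
```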
@@ -121,75 +104,56 @@ int handle_cudaMemcpyAsync(void *conn)

if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
rpc_read(conn, &stream_null_check, sizeof(int)) < 0 ||
(stream_null_check == 0 && rpc_read(conn, &stream, sizeof(cudaStream_t)) < 0))
(stream_null_check == 0 && rpc_read(conn, &stream, sizeof(cudaStream_t)) < 0) ||
(kind != cudaMemcpyHostToDevice && rpc_read(conn, &src, sizeof(void *)) < 0) ||
(kind != cudaMemcpyDeviceToHost && rpc_read(conn, &dst, sizeof(void *)) < 0) ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

switch (kind)
{
case cudaMemcpyDeviceToHost:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_1;
goto ERROR_0;

result = cudaMemcpyAsync(host_data, src, count, kind, stream);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, host_data, count) < 0)
goto ERROR_1;
break;
case cudaMemcpyHostToDevice:
if (rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;

if (rpc_read(conn, host_data, count) < 0)
goto ERROR_1;
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_1;
goto ERROR_0;

result = cudaMemcpyAsync(dst, host_data, count, kind, stream);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_1;
break;
case cudaMemcpyDeviceToDevice:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;

result = cudaMemcpyAsync(dst, src, count, kind, stream);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_0;
break;
}

if (rpc_end_response(conn, &result) < 0 ||
cudaStreamSynchronize(stream) != cudaSuccess)
goto ERROR_1;
if (rpc_start_response(conn, request_id) < 0 ||
(kind == cudaMemcpyDeviceToHost && rpc_write(conn, host_data, count) < 0) ||
rpc_end_response(conn, &result) < 0 ||
(host_data != NULL && cudaStreamAddCallback(stream, [](cudaStream_t stream, cudaError_t status, void *ptr)
{ free(ptr); }, host_data, 0) != cudaSuccess))
goto ERROR_0;

ret = 0;
ERROR_1:
free((void *)host_data);
ERROR_0:
return ret;
}
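
The async handler can no longer free `host_data` before returning: `cudaMemcpyAsync` may still be reading or writing the staging buffer after the response is sent, so the rewrite registers a `cudaStreamAddCallback` that frees the buffer only once the stream has drained, and the old blocking `cudaStreamSynchronize` disappears. A minimal standalone sketch of the deferred-free idiom (illustrative, not scuda code):

```cpp
// Minimal sketch of the deferred-free idiom used above: the staging buffer
// passed to cudaMemcpyAsync may still be in flight when this function returns,
// so it is freed from a stream callback that fires only after all previously
// enqueued work on the stream has completed.
#include <cuda_runtime.h>
#include <cstdlib>
#include <cstring>

void enqueue_copy(void *device_dst, const void *host_src, size_t count, cudaStream_t stream) {
    void *staging = malloc(count);
    if (staging == NULL) return;
    memcpy(staging, host_src, count);
    cudaMemcpyAsync(device_dst, staging, count, cudaMemcpyHostToDevice, stream);
    // Freeing here would race with the asynchronous copy; defer it instead.
    cudaStreamAddCallback(stream,
        [](cudaStream_t, cudaError_t, void *ptr) { free(ptr); },
        staging, 0);
}
```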
@@ -575,11 +539,11 @@ int handle___cudaRegisterVar(void *conn)

int handle_cudaFree(void *conn)
{
void* devPtr;
void *devPtr;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &devPtr, sizeof(void *)) < 0 ||
false)
goto ERROR_0;

@@ -599,13 +563,13 @@

int handle_cudaMallocManaged(void *conn)
{
void* devPtr;
void *devPtr;
size_t size;
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &devPtr, sizeof(void *)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
false)
@@ -617,7 +581,7 @@ int handle_cudaMallocManaged(void *conn)
scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_write(conn, &devPtr, sizeof(void *)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;
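
Note what `handle_cudaMallocManaged` implies: the managed pointer written back is a server-side address, so the client has to remember it in order to keep its local view of the buffer in sync (see the `cuda_memcpy_unified_ptrs` sketch earlier). A hedged sketch of the client-side counterpart (generated elsewhere, not shown in this diff; `RPC_cudaMallocManaged` and the failure code are assumed names):

```cpp
// Hypothetical client-side counterpart -- not part of this diff. It forwards
// the request and overwrites *devPtr with the server's managed allocation.
cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) {
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||  // assumed opcode name
        rpc_write(0, devPtr, sizeof(void *)) < 0 ||
        rpc_write(0, &size, sizeof(size_t)) < 0 ||
        rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_read(0, devPtr, sizeof(void *)) < 0 ||  // server-side pointer comes back
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorUnknown;  // assumed failure code
    return return_value;
}
```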

1 change: 1 addition & 0 deletions deploy/Dockerfile.unified
@@ -29,6 +29,7 @@ ENV libscuda_path=/usr/local/lib/libscuda.so
COPY ./libscuda.so /usr/local/lib/libscuda.so
COPY unified.o unified.o
COPY unified_pointer.o unified_pointer.o
COPY cublas_unified.o cublas_unified.o

COPY start.sh /start.sh
RUN chmod +x /start.sh
2 changes: 1 addition & 1 deletion deploy/start.sh
@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
elif [[ "$1" == "unified" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /unified_pointer.o
LD_PRELOAD="$libscuda_path" /cublas_unified.o
else
echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
fi
22 changes: 15 additions & 7 deletions local.sh
@@ -24,13 +24,6 @@ build() {

echo "building vector file for test..."

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
exit 1
@@ -231,6 +224,18 @@ test() {
done
}

build_tests() {
build

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o
}

run() {
build

@@ -244,6 +249,9 @@ case "$1" in
build)
build
;;
build_tests)
build_tests
;;
run)
run
;;