Merge branch 'main' of github.com:kevmo314/scuda
brodeynewman committed Dec 20, 2024
2 parents 5fb85af + d349b1b commit 644f22a
Showing 7 changed files with 168 additions and 88 deletions.
22 changes: 5 additions & 17 deletions README.md
@@ -3,30 +3,18 @@
SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
to CPU-only machines.

## Demos
## Demo

### CUBLAS Matrix Multiplication
### CUBLAS Matrix Multiplication using Unified Memory

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.
The docker container runs this [cublas_unified](./deploy/cublas_unified.o) example, which not only uses cuBLAS but also takes advantage of unified memory.

You can view the docker image used [here](./deploy/Dockerfile.cublas-test).

https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499

### Simple torch example

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check whether CUDA is available.

You can view the docker image used [here](./deploy/Dockerfile.torch-test).

https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec
You can view the docker image used [here](./deploy/Dockerfile.unified).

https://github.com/user-attachments/assets/b2db5d82-f214-41cf-8274-b913c04080f9
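
To give a feel for what the demo exercises, here is a minimal illustrative sketch of the pattern it relies on: a cuBLAS GEMM running directly on unified (managed) memory, with no explicit `cudaMemcpy` calls. This is a hedged sketch, not the shipped `cublas_unified` example.

```cpp
// Minimal sketch (not the shipped example): cuBLAS SGEMM on unified memory.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int n = 4;
    float *a, *b, *c;
    // Unified memory: the same pointers are valid on host and device.
    cudaMallocManaged(&a, n * n * sizeof(float));
    cudaMallocManaged(&b, n * n * sizeof(float));
    cudaMallocManaged(&c, n * n * sizeof(float));
    for (int i = 0; i < n * n; i++) { a[i] = 1.0f; b[i] = 2.0f; c[i] = 0.0f; }

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // C = alpha * A * B + beta * C (column-major layout).
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                &alpha, a, n, b, n, &beta, c, n);
    cudaDeviceSynchronize();  // make the result visible on the host

    printf("c[0] = %f\n", c[0]);  // 8.0: dot of a row of ones with a column of twos
    cublasDestroy(handle);
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}
```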

## Local development

3 changes: 3 additions & 0 deletions codegen/gen_client.cpp
@@ -19,6 +19,7 @@ extern int rpc_end_request(const int index);
extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
extern int rpc_close();

nvmlReturn_t nvmlInit_v2()
@@ -18581,6 +18582,7 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr

cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount)
{
cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);
cublasStatus_t return_value;
if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 ||
rpc_write(0, &batchCount, sizeof(int)) < 0 ||
@@ -18603,6 +18605,7 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return CUBLAS_STATUS_NOT_INITIALIZED;
cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);
return return_value;
}
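
The generated client now brackets the cuBLAS call with `cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice)` before the RPC and `cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost)` after it, so unified-memory buffers are pushed to the remote GPU before the GEMM and pulled back once it completes. The helper's implementation is not part of this diff; the following is a hedged sketch of one plausible shape, assuming the client keeps a registry of managed allocations (the names `unified_alloc` and `g_unified_allocs` are invented for illustration):

```cpp
// Hypothetical sketch only -- cuda_memcpy_unified_ptrs is declared but not
// implemented in this diff. Assumption: the intercepted cudaMallocManaged
// records each allocation, pairing the server-side pointer with a client-side
// shadow buffer, and every remote call syncs the pair in both directions via
// the intercepted, RPC-backed cudaMemcpy.
#include <cuda_runtime.h>
#include <vector>

struct unified_alloc { void *server_ptr; void *shadow; size_t size; };
static std::vector<unified_alloc> g_unified_allocs;  // filled by the cudaMallocManaged hook

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind) {
    (void)index;  // connection index, unused in this sketch
    for (auto &a : g_unified_allocs) {
        if (kind == cudaMemcpyHostToDevice)
            // push the client's view of the buffer to the remote GPU
            cudaMemcpy(a.server_ptr, a.shadow, a.size, cudaMemcpyHostToDevice);
        else
            // pull the remote GPU's view back into the client's buffer
            cudaMemcpy(a.shadow, a.server_ptr, a.size, cudaMemcpyDeviceToHost);
    }
}
```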

90 changes: 27 additions & 63 deletions codegen/manual_server.cpp
@@ -34,16 +34,15 @@ int handle_cudaMemcpy(void *conn)
enum cudaMemcpyKind kind;
int ret = -1;

if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0)
if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
(kind != cudaMemcpyHostToDevice && rpc_read(conn, &src, sizeof(void *)) < 0) ||
(kind != cudaMemcpyDeviceToHost && rpc_read(conn, &dst, sizeof(void *)) < 0) ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

switch (kind)
{
case cudaMemcpyDeviceToHost:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;
@@ -53,16 +52,8 @@ int handle_cudaMemcpy(void *conn)
goto ERROR_1;

result = cudaMemcpy(host_data, src, count, kind);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, host_data, count) < 0)
goto ERROR_1;
break;
case cudaMemcpyHostToDevice:
if (rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;
@@ -75,33 +66,25 @@ int handle_cudaMemcpy(void *conn)
goto ERROR_1;

result = cudaMemcpy(dst, host_data, count, kind);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_1;
break;
case cudaMemcpyDeviceToDevice:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;

result = cudaMemcpy(dst, src, count, kind);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_0;
break;
}

if (rpc_end_response(conn, &result) < 0)
if (rpc_start_response(conn, request_id) < 0 ||
(kind == cudaMemcpyDeviceToHost && rpc_write(conn, host_data, count) < 0) ||
rpc_end_response(conn, &result) < 0)
goto ERROR_1;

ret = 0;
ERROR_1:
free((void *)host_data);
if (host_data != NULL)
free((void *)host_data);
ERROR_0:
return ret;
}
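
The rewrite hoists the direction-dependent reads out of the switch: C's short-circuit evaluation makes the `src`/`dst` reads conditional on `kind` while keeping a single error check, and the response path collapses to one block that streams the payload back only for device-to-host copies. Any client must mirror this wire order exactly. Here is a hedged sketch of the matching client-side call (generated elsewhere, not shown in this diff; `RPC_cudaMemcpy` and the failure code are assumed names):

```cpp
// Hypothetical client-side counterpart -- not part of this diff. The write
// order mirrors the server's reads: kind, then src and/or dst depending on
// direction, then count, then the payload for host-to-device copies.
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaMemcpy) < 0 ||  // assumed opcode name
        rpc_write(0, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
        (kind != cudaMemcpyHostToDevice && rpc_write(0, &src, sizeof(void *)) < 0) ||
        (kind != cudaMemcpyDeviceToHost && rpc_write(0, &dst, sizeof(void *)) < 0) ||
        rpc_write(0, &count, sizeof(size_t)) < 0 ||
        (kind == cudaMemcpyHostToDevice && rpc_write(0, src, count) < 0) ||
        rpc_wait_for_response(0) < 0 ||
        (kind == cudaMemcpyDeviceToHost && rpc_read(0, dst, count) < 0) ||
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorUnknown;  // assumed failure code
    return return_value;
}
```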
@@ -121,75 +104,56 @@ int handle_cudaMemcpyAsync(void *conn)

if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
rpc_read(conn, &stream_null_check, sizeof(int)) < 0 ||
(stream_null_check == 0 && rpc_read(conn, &stream, sizeof(cudaStream_t)) < 0))
(stream_null_check == 0 && rpc_read(conn, &stream, sizeof(cudaStream_t)) < 0) ||
(kind != cudaMemcpyHostToDevice && rpc_read(conn, &src, sizeof(void *)) < 0) ||
(kind != cudaMemcpyDeviceToHost && rpc_read(conn, &dst, sizeof(void *)) < 0) ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

switch (kind)
{
case cudaMemcpyDeviceToHost:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_1;
goto ERROR_0;

result = cudaMemcpyAsync(host_data, src, count, kind, stream);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, host_data, count) < 0)
goto ERROR_1;
break;
case cudaMemcpyHostToDevice:
if (rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

host_data = malloc(count);
if (host_data == NULL)
goto ERROR_0;

if (rpc_read(conn, host_data, count) < 0)
goto ERROR_1;
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_1;
goto ERROR_0;

result = cudaMemcpyAsync(dst, host_data, count, kind, stream);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_1;
break;
case cudaMemcpyDeviceToDevice:
if (rpc_read(conn, &src, sizeof(void *)) < 0 ||
rpc_read(conn, &dst, sizeof(void *)) < 0 ||
rpc_read(conn, &count, sizeof(size_t)) < 0)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;

result = cudaMemcpyAsync(dst, src, count, kind, stream);

if (rpc_start_response(conn, request_id) < 0)
goto ERROR_0;
break;
}

if (rpc_end_response(conn, &result) < 0 ||
cudaStreamSynchronize(stream) != cudaSuccess)
goto ERROR_1;
if (rpc_start_response(conn, request_id) < 0 ||
(kind == cudaMemcpyDeviceToHost && rpc_write(conn, host_data, count) < 0) ||
rpc_end_response(conn, &result) < 0 ||
(host_data != NULL && cudaStreamAddCallback(stream, [](cudaStream_t stream, cudaError_t status, void *ptr)
{ free(ptr); }, host_data, 0) != cudaSuccess))
goto ERROR_0;

ret = 0;
ERROR_1:
free((void *)host_data);
ERROR_0:
return ret;
}
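
The async handler can no longer free `host_data` before returning: `cudaMemcpyAsync` may still be reading or writing the staging buffer after the response is sent, so the rewrite registers a `cudaStreamAddCallback` that frees the buffer only once the stream has drained, and the old blocking `cudaStreamSynchronize` disappears. A minimal standalone sketch of the deferred-free idiom (illustrative, not scuda code):

```cpp
// Minimal sketch of the deferred-free idiom used above: the staging buffer
// passed to cudaMemcpyAsync may still be in flight when this function returns,
// so it is freed from a stream callback that fires only after all previously
// enqueued work on the stream has completed.
#include <cuda_runtime.h>
#include <cstdlib>
#include <cstring>

void enqueue_copy(void *device_dst, const void *host_src, size_t count, cudaStream_t stream) {
    void *staging = malloc(count);
    if (staging == NULL) return;
    memcpy(staging, host_src, count);
    cudaMemcpyAsync(device_dst, staging, count, cudaMemcpyHostToDevice, stream);
    // Freeing here would race with the asynchronous copy; defer it instead.
    cudaStreamAddCallback(stream,
        [](cudaStream_t, cudaError_t, void *ptr) { free(ptr); },
        staging, 0);
}
```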
@@ -575,11 +539,11 @@ int handle___cudaRegisterVar(void *conn)

int handle_cudaFree(void *conn)
{
void* devPtr;
void *devPtr;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &devPtr, sizeof(void *)) < 0 ||
false)
goto ERROR_0;

@@ -599,13 +563,13 @@

int handle_cudaMallocManaged(void *conn)
{
void* devPtr;
void *devPtr;
size_t size;
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &devPtr, sizeof(void *)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
false)
@@ -617,7 +581,7 @@ int handle_cudaMallocManaged(void *conn)
scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_write(conn, &devPtr, sizeof(void *)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;
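
Note what `handle_cudaMallocManaged` implies: the managed pointer written back is a server-side address, so the client has to remember it in order to keep its local view of the buffer in sync (see the `cuda_memcpy_unified_ptrs` sketch earlier). A hedged sketch of the client-side counterpart (generated elsewhere, not shown in this diff; `RPC_cudaMallocManaged` and the failure code are assumed names):

```cpp
// Hypothetical client-side counterpart -- not part of this diff. It forwards
// the request and overwrites *devPtr with the server's managed allocation.
cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) {
    cudaError_t return_value;
    if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||  // assumed opcode name
        rpc_write(0, devPtr, sizeof(void *)) < 0 ||
        rpc_write(0, &size, sizeof(size_t)) < 0 ||
        rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
        rpc_wait_for_response(0) < 0 ||
        rpc_read(0, devPtr, sizeof(void *)) < 0 ||  // server-side pointer comes back
        rpc_end_response(0, &return_value) < 0)
        return cudaErrorUnknown;  // assumed failure code
    return return_value;
}
```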

1 change: 1 addition & 0 deletions deploy/Dockerfile.unified
@@ -29,6 +29,7 @@ ENV libscuda_path=/usr/local/lib/libscuda.so
COPY ./libscuda.so /usr/local/lib/libscuda.so
COPY unified.o unified.o
COPY unified_pointer.o unified_pointer.o
COPY cublas_unified.o cublas_unified.o

COPY start.sh /start.sh
RUN chmod +x /start.sh
2 changes: 1 addition & 1 deletion deploy/start.sh
@@ -13,7 +13,7 @@ elif [[ "$1" == "cublas" ]]; then
elif [[ "$1" == "unified" ]]; then
echo "Running cublas example..."

LD_PRELOAD="$libscuda_path" /unified_pointer.o
LD_PRELOAD="$libscuda_path" /cublas_unified.o
else
echo "Unknown option: $1. Please specify one of: torch | cublas | unified ."
fi
22 changes: 15 additions & 7 deletions local.sh
@@ -24,13 +24,6 @@ build() {

echo "building vector file for test..."

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
exit 1
@@ -231,6 +224,18 @@ test() {
done
}

build_tests() {
build

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_batched.cu -o cublas_batched.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified.cu -o unified.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_pointer.cu -o unified_pointer.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o
}

run() {
build

@@ -244,6 +249,9 @@ case "$1" in
build)
build
;;
build_tests)
build_tests
;;
run)
run
;;