
Brodey | unified memory #62

Merged: 34 commits into main from brodey/unified on Dec 19, 2024. Changes from all commits are shown below.

Commits (all by brodeynewman):
350c7d8  chore: bm (Oct 9, 2024)
1ad672a  Merge branch 'main' of github.com:kevmo314/scuda (Oct 9, 2024)
59150ee  chore: merge (Oct 9, 2024)
c7d0b7d  Merge branch 'main' of github.com:kevmo314/scuda (Oct 11, 2024)
29a919e  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
233b8e9  Merge branch 'main' of github.com:kevmo314/scuda (Oct 14, 2024)
fc00189  Merge branch 'main' of github.com:kevmo314/scuda (Oct 17, 2024)
79ccd26  Merge branch 'main' of github.com:kevmo314/scuda (Oct 23, 2024)
38a351c  Merge branch 'main' of github.com:kevmo314/scuda (Oct 29, 2024)
ab2e209  Merge branch 'main' of github.com:kevmo314/scuda (Nov 6, 2024)
ccd7c31  Merge branch 'main' of github.com:kevmo314/scuda (Nov 8, 2024)
25cad41  Merge branch 'main' of github.com:kevmo314/scuda (Nov 9, 2024)
11f8e43  Merge branch 'main' of github.com:kevmo314/scuda (Nov 11, 2024)
8e3d836  Merge branch 'main' of github.com:kevmo314/scuda (Nov 18, 2024)
e20c750  Merge branch 'main' of github.com:kevmo314/scuda (Nov 28, 2024)
8f56379  Merge branch 'main' of github.com:kevmo314/scuda (Dec 2, 2024)
e5592dc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 3, 2024)
aeef059  Merge branch 'main' of github.com:kevmo314/scuda (Dec 7, 2024)
3d29fc8  chore: capture device pointers (Dec 8, 2024)
5b890da  chore: unified working example (Dec 13, 2024)
fdf8853  chore: extern (Dec 13, 2024)
07b0e2a  chore: todo (Dec 13, 2024)
3740492  chore: map original host pointer (Dec 15, 2024)
516909a  chore: cleanup (Dec 15, 2024)
e6ec613  chore: cleanup (Dec 15, 2024)
062360c  chore: build (Dec 16, 2024)
83d1ebc  Merge branch 'main' of github.com:kevmo314/scuda (Dec 16, 2024)
a6683b4  chore: mmap (Dec 18, 2024)
fcc8669  chore: unified pointer ex (Dec 18, 2024)
a90da39  chore: cleanup (Dec 19, 2024)
2d744fd  chore: manual server (Dec 19, 2024)
94a7411  Merge branch 'main' into brodey/unified (Dec 19, 2024)
f687e92  chore: additional cleanup (Dec 19, 2024)
00d6915  chore: unified pointer test (Dec 19, 2024)
111 changes: 111 additions & 0 deletions client.cpp
@@ -18,9 +18,17 @@
#include <cuda.h>
#include <sys/uio.h>
#include <netinet/tcp.h>
#include <cuda_runtime.h>

#include <unordered_map>

#include <setjmp.h>
#include <signal.h>
#include <csignal>
#include <cstdlib>
#include <cstring>
#include <sys/mman.h>

#include "codegen/gen_client.h"

typedef struct
@@ -34,6 +42,8 @@ typedef struct
pthread_cond_t read_cond;
struct iovec write_iov[128];
int write_iov_count = 0;

std::unordered_map<void*, size_t> unified_devices;
} conn_t;

pthread_mutex_t conn_mutex;
@@ -42,8 +52,76 @@ int nconns = 0;

const char *DEFAULT_PORT = "14833";

static int init = 0;
static jmp_buf catch_segfault;
static void* faulting_address = nullptr;

static void segfault(int sig, siginfo_t* info, void* unused) {
[Inline review comment from the repo owner]:
Can you use void* unused to pass the parent segfault handler?

faulting_address = info->si_addr;

std::cout << "Caught segfault at address: " << faulting_address << std::endl;

for (const auto & [ ptr, sz ] : conns[0].unified_devices)
{
if (ptr <= faulting_address && faulting_address < (void *)((char *)ptr + sz))
{
// align the mapping down to the allocation size so it covers the
// faulting address (note: this assumes sz is a power of two).
void* aligned_address = (void*)((uintptr_t)faulting_address & ~(sz - 1));

// Allocate memory at the faulting address
void* allocated = mmap(aligned_address, sz, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (allocated == MAP_FAILED) {
perror("Failed to allocate memory at faulting address");
_exit(1);
}

std::cout << "allocated dynamic memory at address: " << allocated << std::endl;

return;
}
}

// fall back to the default SIGSEGV behavior and re-raise
struct sigaction sa;
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;

if (sigaction(SIGSEGV, &sa, nullptr) == -1) {
perror("Failed to reset SIGSEGV handler");
_exit(EXIT_FAILURE);
}

raise(SIGSEGV);
}

static void set_segfault_handlers() {
if (init > 0) {
return;
}

struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = segfault;

if (sigaction(SIGSEGV, &sa, NULL) == -1) {
perror("sigaction");
exit(EXIT_FAILURE);
}

std::cout << "Segfault handler installed." << std::endl;

init = 1;
}

int rpc_open()
{
set_segfault_handlers();

sigsetjmp(catch_segfault, 1);

if (pthread_mutex_lock(&conn_mutex) < 0)
return -1;

@@ -220,6 +298,39 @@ int rpc_read(const int index, void *data, size_t size)
return n;
}

void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size)
{
// record the device pointer and its size so the fault handler and copies can find it
conns[index].unified_devices.insert({ dev_ptr, size });
}

void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind)
{
for (const auto & [ ptr, sz ] : conns[index].unified_devices) {
size_t size = sz;

// ptr is the same on both host/device
cudaError_t res = cudaMemcpy(ptr, ptr, size, kind);
if (res != cudaSuccess) {
std::cerr << "cudaMemcpy failed :" << cudaGetErrorString(res) << std::endl;
} else {
std::cout << "Successfully copied " << size << " bytes" << std::endl;
}
}
}

void maybe_free_unified_mem(const int index, void *ptr)
{
for (const auto & [ dev_ptr, sz ] : conns[index].unified_devices) {
size_t size = sz;

if (dev_ptr == ptr) {
munmap(dev_ptr, size);
return;
}
}
}

int rpc_end_response(const int index, void *result)
{
if (read(conns[index].connfd, result, sizeof(int)) < 0 ||
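Regarding the inline review comment above: one way to honor it is to capture whatever handler was installed before scuda's and fall through to it for faults outside the tracked ranges, instead of resetting to SIG_DFL and re-raising unconditionally. A minimal sketch of that approach, meant to live in this same file; prev_sa, install_with_chaining, and chain_to_parent are illustrative names, not code from this PR:

// Sketch only: remember the previously installed SIGSEGV handler so faults
// we don't own can be forwarded to it (e.g. the host app's crash reporter).
static struct sigaction prev_sa;

static void install_with_chaining() {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = segfault;
    // Passing &prev_sa captures the parent handler instead of discarding it.
    if (sigaction(SIGSEGV, &sa, &prev_sa) == -1) {
        perror("sigaction");
        exit(EXIT_FAILURE);
    }
}

// Called from segfault() when the faulting address is not in unified_devices.
static void chain_to_parent(int sig, siginfo_t *info, void *ctx) {
    if ((prev_sa.sa_flags & SA_SIGINFO) && prev_sa.sa_sigaction != nullptr) {
        prev_sa.sa_sigaction(sig, info, ctx);    // parent used SA_SIGINFO
    } else if (prev_sa.sa_handler != SIG_DFL && prev_sa.sa_handler != SIG_IGN) {
        prev_sa.sa_handler(sig);                 // classic one-argument handler
    } else {
        sigaction(SIGSEGV, &prev_sa, nullptr);   // restore default and re-fault
        raise(sig);
    }
}

This keeps the PR's current behavior as the fallback while still letting a pre-existing handler see any fault scuda doesn't claim.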
2 changes: 2 additions & 0 deletions codegen/annotations.h
@@ -4357,6 +4357,7 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *f
*/
cudaError_t cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
/**
* @disabled
* @param devPtr SEND_RECV
* @param size SEND_ONLY
* @param flags SEND_ONLY
@@ -4388,6 +4389,7 @@ cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t h
*/
cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, unsigned int flags);
/**
* @disabled
* @param devPtr SEND_ONLY
*/
cudaError_t cudaFree(void *devPtr);
2 changes: 2 additions & 0 deletions codegen/codegen.py
@@ -69,9 +69,11 @@
# a list of manually implemented cuda/nvml functions.
# these are automatically appended to each file; operation order is maintained as well.
MANUAL_IMPLEMENTATIONS = [
"cudaFree",
"cudaMemcpy",
"cudaMemcpyAsync",
"cudaLaunchKernel",
"cudaMallocManaged",
]

@dataclass
29 changes: 2 additions & 27 deletions codegen/gen_client.cpp
@@ -9112,20 +9112,6 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c
return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
rpc_write(0, devPtr, sizeof(void*)) < 0 ||
rpc_write(0, &size, sizeof(size_t)) < 0 ||
rpc_write(0, &flags, sizeof(unsigned int)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_read(0, devPtr, sizeof(void*)) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
return return_value;
}

cudaError_t cudaMalloc(void** devPtr, size_t size)
{
cudaError_t return_value;
@@ -9183,17 +9169,6 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe
return return_value;
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;
return return_value;
}

cudaError_t cudaFreeHost(void* ptr)
{
cudaError_t return_value;
@@ -22154,12 +22129,10 @@ std::unordered_map<std::string, void *> functionMap = {
{"cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", (void *)cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags},
{"cudaOccupancyMaxPotentialClusterSize", (void *)cudaOccupancyMaxPotentialClusterSize},
{"cudaOccupancyMaxActiveClusters", (void *)cudaOccupancyMaxActiveClusters},
{"cudaMallocManaged", (void *)cudaMallocManaged},
{"cudaMalloc", (void *)cudaMalloc},
{"cudaMallocHost", (void *)cudaMallocHost},
{"cudaMallocPitch", (void *)cudaMallocPitch},
{"cudaMallocArray", (void *)cudaMallocArray},
{"cudaFree", (void *)cudaFree},
{"cudaFreeHost", (void *)cudaFreeHost},
{"cudaFreeArray", (void *)cudaFreeArray},
{"cudaFreeMipmappedArray", (void *)cudaFreeMipmappedArray},
@@ -22852,9 +22825,11 @@ std::unordered_map<std::string, void *> functionMap = {
{"cuMemFreeAsync_ptsz", (void *)cuMemFreeAsync},
{"cuMemAllocAsync_ptsz", (void *)cuMemAllocAsync},
{"cuMemAllocFromPoolAsync_ptsz", (void *)cuMemAllocFromPoolAsync},
{"cudaFree", (void *)cudaFree},
{"cudaMemcpy", (void *)cudaMemcpy},
{"cudaMemcpyAsync", (void *)cudaMemcpyAsync},
{"cudaLaunchKernel", (void *)cudaLaunchKernel},
{"cudaMallocManaged", (void *)cudaMallocManaged},
};

void *get_function_pointer(const char *name)
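Note that the manually implemented entries are appended at the end of functionMap, after the generated ones, matching the comment in codegen.py above. For context, this map is resolved through get_function_pointer (declared at the bottom of this file); a common way such a map gets consumed is a dlsym interposer in the preloaded client library. A hedged sketch of that pattern, assuming glibc on x86-64; the dlvsym version tag and the hook itself are illustrative, not necessarily scuda's exact mechanism:

// Sketch: serve intercepted CUDA symbols from functionMap first, and fall
// back to the real dlsym for everything else. Compile with -D_GNU_SOURCE so
// <dlfcn.h> exposes RTLD_NEXT; dlvsym avoids recursing into this hook.
#include <dlfcn.h>

extern void *get_function_pointer(const char *name);

extern "C" void *dlsym(void *handle, const char *symbol)
{
    if (void *fn = get_function_pointer(symbol))
        return fn;
    using dlsym_fn = void *(*)(void *, const char *);
    static dlsym_fn real_dlsym =
        (dlsym_fn)dlvsym(RTLD_NEXT, "dlsym", "GLIBC_2.2.5");
    return real_dlsym(handle, symbol);
}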
53 changes: 0 additions & 53 deletions codegen/gen_server.cpp
@@ -19387,35 +19387,6 @@ int handle_cudaOccupancyMaxActiveClusters(void *conn)
return -1;
}

int handle_cudaMallocManaged(void *conn)
{
void* devPtr;
size_t size;
unsigned int flags;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
rpc_read(conn, &flags, sizeof(unsigned int)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaMallocManaged(&devPtr, size, flags);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaMalloc(void *conn)
{
void* devPtr;
@@ -19534,30 +19505,6 @@ int handle_cudaMallocArray(void *conn)
return -1;
}

int handle_cudaFree(void *conn)
{
void* devPtr;
int request_id;
cudaError_t scuda_intercept_result;
if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
false)
goto ERROR_0;

request_id = rpc_end_request(conn);
if (request_id < 0)
goto ERROR_0;
scuda_intercept_result = cudaFree(devPtr);

if (rpc_start_response(conn, request_id) < 0 ||
rpc_end_response(conn, &scuda_intercept_result) < 0)
goto ERROR_0;

return 0;
ERROR_0:
return -1;
}

int handle_cudaFreeHost(void *conn)
{
void* ptr;
44 changes: 42 additions & 2 deletions codegen/manual_client.cpp
@@ -24,6 +24,9 @@ extern int rpc_wait_for_response(const int index);
extern int rpc_read(const int index, void *data, const std::size_t size);
extern int rpc_end_response(const int index, void *return_value);
extern int rpc_close();
void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind);
void maybe_free_unified_mem(const int index, void *ptr);
extern void allocate_unified_mem_pointer(const int index, void *dev_ptr, size_t size);

#define MAX_FUNCTION_NAME 1024
#define MAX_ARGS 128
@@ -337,6 +340,8 @@ cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void
{
cudaError_t return_value;

cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice);

// Start the RPC request
int request_id = rpc_start_request(0, RPC_cudaLaunchKernel);
if (request_id < 0)
@@ -395,6 +400,8 @@
return cudaErrorDevicesUnavailable;
}

cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost);

return return_value;
}

@@ -483,8 +490,6 @@ void parse_ptx_string(void *fatCubin, const char *ptx_string, unsigned long long
if (type_size == 0)
continue;
arg_size = type_size;

std::cout << "arg size: " << arg_size << std::endl;
}
else if (ptx_string[i] == '[')
{
@@ -705,6 +710,8 @@ extern "C"
{
void *return_value;

std::cout << "calling __cudaRegisterVar" << std::endl;

// Start the RPC request
int request_id = rpc_start_request(0, RPC___cudaRegisterVar);
if (request_id < 0)
@@ -792,3 +799,36 @@ }
}
}
}

cudaError_t cudaFree(void* devPtr)
{
cudaError_t return_value;
maybe_free_unified_mem(0, devPtr);

if (rpc_start_request(0, RPC_cudaFree) < 0 ||
rpc_write(0, &devPtr, sizeof(void*)) < 0 ||
rpc_wait_for_response(0) < 0 ||
rpc_end_response(0, &return_value) < 0)
return cudaErrorDevicesUnavailable;

return return_value;
}

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
void* d_mem;

cudaError_t err = cudaMalloc((void**)&d_mem, size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
return err;
}

std::cout << "allocated unified device mem " << d_mem << " size: " << size << std::endl;

allocate_unified_mem_pointer(0, d_mem, size);

*devPtr = d_mem;

return cudaSuccess;
}
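Putting the pieces together, a minimal client program exercising this path might look like the sketch below (our illustration, modeled on the "chore: unified pointer test" commit; the kernel and sizes are made up, and it assumes the scuda client library is preloaded and compiled with nvcc):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

int main()
{
    const int n = 256;
    float *data = nullptr;

    // Intercepted by the manual cudaMallocManaged above: allocates device
    // memory over RPC and records {ptr, size} in unified_devices.
    if (cudaMallocManaged(&data, n * sizeof(float), cudaMemAttachGlobal) != cudaSuccess)
        return 1;

    // The first host write faults; the client's SIGSEGV handler mmaps
    // anonymous pages over the device address so the loop can proceed.
    for (int i = 0; i < n; ++i)
        data[i] = float(i);

    // The manual cudaLaunchKernel copies unified ranges host-to-device
    // before the launch and device-to-host after it.
    scale<<<(n + 63) / 64, 64>>>(data, n);
    cudaDeviceSynchronize();

    printf("data[1] = %f\n", data[1]); // expected: 2.0

    cudaFree(data); // munmaps the host mapping, then frees over RPC
    return 0;
}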
2 changes: 2 additions & 0 deletions codegen/manual_client.h
@@ -3,6 +3,8 @@
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

cudaError_t cudaFree(void* devPtr);
cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags);
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
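Stepping back from the diffs: the client-side scheme above rests on one mechanism, mapping anonymous pages at a faulted address from inside a SIGSEGV handler so the faulting instruction succeeds when it retries. Below is a self-contained miniature of just that trick (no CUDA, entirely our demo; it pins the page with MAP_FIXED, whereas the handler in client.cpp passes the address only as a hint). Note that mmap is not formally async-signal-safe, though on Linux it is a single syscall, which is what the PR relies on:

#include <csignal>
#include <cstdio>
#include <cstring>
#include <unistd.h>
#include <sys/mman.h>

static void *region;                 // address we promise to back on demand
static const size_t kSize = 4096;

static void on_fault(int, siginfo_t *info, void *)
{
    if (info->si_addr != region) {   // not ours: restore default and re-fault
        signal(SIGSEGV, SIG_DFL);
        return;
    }
    // Back the page with anonymous memory; MAP_FIXED pins it to the address
    // the retried instruction will use.
    if (mmap(region, kSize, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
        _exit(1);
}

int main()
{
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = on_fault;
    sigaction(SIGSEGV, &sa, nullptr);

    // Reserve an address range with no access so the first touch faults.
    region = mmap(nullptr, kSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    ((char *)region)[0] = 42;        // faults, handler maps it, retry succeeds
    printf("read back: %d\n", ((char *)region)[0]);
    return 0;
}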