Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cr: build checkpoint restart support for cuModules, cuFunctions and globals #56

Merged
merged 6 commits into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ bin
submodules/cuda-gdb
submodules/lib
ckp
criu-ckp
.nfs*
*.rpm
*.test
*.testapp
tests/samples/matrixMul
tests/samples/bandwidthTest
tests/samples/nbody
.cache

# Auto-generated by rpcgen
cpu/cpu_rpc_prot_svc_mod.c
Expand Down
52 changes: 40 additions & 12 deletions cpu/cpu-server-driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
#include <cuda.h>

#include "cpu_rpc_prot.h"
#include "cpu-server-driver-hidden.h"
#include "cpu-common.h"
#include "cpu-utils.h"
#include "log.h"
#include "resource-mg.h"
#define WITH_RECORDER
Expand Down Expand Up @@ -36,40 +33,58 @@ int server_driver_init(int restore)

#include <cuda_runtime_api.h>

// Does not support checkpoint/restart yet
bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp)
{
RECORD_API(rpc_elf_load_1_argument);
RECORD_ARG(1, elf);
RECORD_ARG(2, module_key);
char *elf_copy = NULL;
LOGE(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key);
CUresult res;
CUmodule module = NULL;

GSCHED_RETAIN;
if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) {
LOGE(LOG_ERROR, "cuModuleLoadData failed: %d", res);
*result = res;
return 1;
}

// FIXME: We have to copy the elf because libtirpc will free it after the
// call. This is pretty ugly and we should find a better solution.
if ((elf_copy = malloc(elf.mem_data_len)) == NULL) {
LOGE(LOG_ERROR, "could not allocate memory");
*result = 1;
}
if (memcpy(elf_copy, elf.mem_data_val, elf.mem_data_len) == NULL) {
LOGE(LOG_ERROR, "could not copy elf");
*result = 1;
}
arguments->arg1.mem_data_val = elf_copy;

// We add our module using module_key as key. This means a fatbinaryHandle on the client is translated
// to a CUmodule on the server.
if ((res = resource_mg_add_sorted(&rm_modules, (void*)module_key, (void*)module)) != CUDA_SUCCESS) {
LOGE(LOG_ERROR, "resource_mg_create failed: %d", res);
*result = res;
return 1;
}

GSCHED_RELEASE;
LOGE(LOG_DEBUG, "->module: %p", module);
*result = 0;
RECORD_RESULT(integer, *result);
return 1;
}

// Does not support checkpoint/restart yet
// TODO: We should also remove associated function handles
bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
{
RECORD_API(ptr);
RECORD_SINGLE_ARG(elf_handle);
LOGE(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle);
CUmodule module = NULL;
CUresult res;


GSCHED_RETAIN;
if ((module = (CUmodule)resource_mg_get(&rm_modules, (void*)elf_handle)) == NULL) {
LOG(LOG_ERROR, "resource_mg_get failed");
*result = -1;
Expand All @@ -91,12 +106,13 @@ bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
*result = res;
return 1;
}
GSCHED_RELEASE;

*result = 0;
RECORD_RESULT(integer, *result);
return 1;
}

// Does not support checkpoint/restart yet
bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun,
char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp)
{
Expand All @@ -109,8 +125,15 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
RECORD_ARG(5, thread_limit);
LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)",
fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
record->data_size = strlen(deviceFun) + 1 + strlen(deviceName) + 1;
if ((record->data = malloc(record->data_size)) == NULL) {
LOGE(LOG_ERROR, "could not allocate memory");
return 1;
}
strcpy(record->data, deviceFun);
strcpy(record->data + strlen(deviceFun) + 1, deviceName);

GSCHED_RETAIN;
//resource_mg_print(&rm_modules);
if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) {
LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
result->err = -1;
Expand All @@ -127,7 +150,6 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
return 1;
}

// Does not support checkpoint/restart yet
bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress, char *deviceName, int ext, size_t size,
int constant, int global, int *result, struct svc_req *rqstp)
{
Expand All @@ -144,7 +166,13 @@ bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress
LOG(LOG_DEBUG, "rpc_register_var(fatCubinHandle: %p, hostVar: %p, deviceAddress: %p, deviceName: %s, "
"ext: %d, size: %d, constant: %d, global: %d)",
fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global);

record->data_size = strlen(deviceName) + 1;
if ((record->data = malloc(record->data_size)) == NULL) {
LOGE(LOG_ERROR, "could not allocate memory");
return 1;
}
strcpy(record->data, deviceName);

CUdeviceptr dptr = 0;
size_t d_size = 0;
CUresult res;
Expand Down
16 changes: 13 additions & 3 deletions cpu/cpu-server-runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,13 @@ int server_runtime_init(int restore)
ret &= server_runtime_restore("ckp");
ret &= cublaslt_init(0, &rm_memory);
}


return ret;
}

int server_runtime_init_cuda(void)
{
int ret = 0;
// Make sure runtime API is initialized
// If we don't do this and use the driver API, it might be unintialized
cudaError_t cres;
Expand All @@ -96,7 +102,6 @@ int server_runtime_init(int restore)
ret = 1;
}
cudaDeviceSynchronize();

return ret;
}

Expand Down Expand Up @@ -125,6 +130,10 @@ int server_runtime_checkpoint(const char *path, int dump_memory, unsigned long p
return 1;
}
if (dump_memory == 1) {
if (cr_dump_elfs(path) != 0) {
LOGE(LOG_ERROR, "error dumping elfs");
return 1;
}
if (cr_dump_memory(path) != 0) {
LOGE(LOG_ERROR, "error dumping memory");
return 1;
Expand All @@ -138,7 +147,8 @@ int server_runtime_restore(const char *path)
struct timeval start, end;
double time = 0;
gettimeofday(&start, NULL);
if (cr_restore(path, &rm_memory, &rm_streams, &rm_events, &rm_arrays, cusolver_get_rm(), cublas_get_rm()) != 0) {
if (cr_restore(path, &rm_memory, &rm_streams, &rm_events, &rm_arrays,
cusolver_get_rm(), cublas_get_rm(), &rm_modules) != 0) {
LOGE(LOG_ERROR, "error restoring api_records");
return 1;
}
Expand Down
1 change: 1 addition & 0 deletions cpu/cpu-server-runtime.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef _CPU_SERVER_RUNTIME_H_
#define _CPU_SERVER_RUNTIME_H_

int server_runtime_init_cuda(void);
int server_runtime_init(int restore);
int server_runtime_deinit(void);
int server_runtime_checkpoint(const char *path, int dump_memory, unsigned long prog, unsigned long vers);
Expand Down
14 changes: 9 additions & 5 deletions cpu/cpu-server.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ void cricket_main(size_t prog_num, size_t vers_num)
restore = 1;
}

if (server_runtime_init_cuda() != 0) {
LOGE(LOG_ERROR, "initializing server_runtime failed.");
goto cleanup4;
}
if (restore == 1) {
if (cr_restore_rpc_id("ckp", &prog, &vers) != 0) {
LOGE(LOG_ERROR, "error while restoring rpc id");
Expand Down Expand Up @@ -281,16 +285,16 @@ void cricket_main(size_t prog_num, size_t vers_num)
goto cleanup4;
}

if (server_runtime_init(restore) != 0) {
if (server_driver_init(restore) != 0) {
LOGE(LOG_ERROR, "initializing server_runtime failed.");
goto cleanup3;
goto cleanup2;
}

if (server_driver_init(restore) != 0) {
if (server_runtime_init(restore) != 0) {
LOGE(LOG_ERROR, "initializing server_runtime failed.");
goto cleanup2;
goto cleanup3;
}

if (server_nvml_init(restore) != 0) {
LOGE(LOG_ERROR, "initializing server_nvml failed.");
goto cleanup1;
Expand Down
Loading