RWTH-ACS · n-eiling · Dec 29, 2024 · Dec 20, 2024 · Dec 29, 2024 · Dec 29, 2024
diff --git a/.gitignore b/.gitignore
@@ -23,13 +23,15 @@ bin
 submodules/cuda-gdb
 submodules/lib
 ckp
+criu-ckp
 .nfs*
 *.rpm
 *.test
 *.testapp
 tests/samples/matrixMul
 tests/samples/bandwidthTest
 tests/samples/nbody
+.cache
 
 # Auto-generated by rpcgen
 cpu/cpu_rpc_prot_svc_mod.c

diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
@@ -4,9 +4,6 @@
 #include <cuda.h>
 
 #include "cpu_rpc_prot.h"
-#include "cpu-server-driver-hidden.h"
-#include "cpu-common.h"
-#include "cpu-utils.h"
 #include "log.h"
 #include "resource-mg.h"
 #define WITH_RECORDER
@@ -36,40 +33,58 @@ int server_driver_init(int restore)
 
 #include <cuda_runtime_api.h>
 
-// Does not support checkpoint/restart yet
 bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp)
 {
+    RECORD_API(rpc_elf_load_1_argument);
+    RECORD_ARG(1, elf);
+    RECORD_ARG(2, module_key);
+    char *elf_copy = NULL;
     LOGE(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key);
     CUresult res;
     CUmodule module = NULL;
-
+    GSCHED_RETAIN;
     if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) {
         LOGE(LOG_ERROR, "cuModuleLoadData failed: %d", res);
         *result = res;
         return 1;
     }
 
+    // FIXME: We have to copy the elf because libtirpc will free it after the
+    // call. This is pretty ugly and we should find a better solution.
+    if ((elf_copy = malloc(elf.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "could not allocate memory");
+        *result = 1;
+    }
+    if (memcpy(elf_copy, elf.mem_data_val, elf.mem_data_len) == NULL) {
+        LOGE(LOG_ERROR, "could not copy elf");
+        *result = 1;
+    }
+    arguments->arg1.mem_data_val = elf_copy;
+
     // We add our module using module_key as key. This means a fatbinaryHandle on the client is translated
     // to a CUmodule on the server.
     if ((res = resource_mg_add_sorted(&rm_modules, (void*)module_key, (void*)module)) != CUDA_SUCCESS) {
         LOGE(LOG_ERROR, "resource_mg_create failed: %d", res);
         *result = res;
         return 1;
     }
-
+    GSCHED_RELEASE;
     LOGE(LOG_DEBUG, "->module: %p", module);
     *result = 0;
+    RECORD_RESULT(integer, *result);
     return 1;
 }
 
-// Does not support checkpoint/restart yet
 // TODO: We should also remove associated function handles
 bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
 {
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(elf_handle);
     LOGE(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle);
     CUmodule module = NULL;
     CUresult res;
-
+
+    GSCHED_RETAIN;
     if ((module = (CUmodule)resource_mg_get(&rm_modules, (void*)elf_handle)) == NULL) {
         LOG(LOG_ERROR, "resource_mg_get failed");
         *result = -1;
@@ -91,12 +106,13 @@ bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
         *result = res;
         return 1;
     }
+    GSCHED_RELEASE;
 
     *result = 0;
+    RECORD_RESULT(integer, *result);
     return 1;
 }
 
-// Does not support checkpoint/restart yet
 bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun,
                             char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp)
 {
@@ -109,8 +125,15 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
     RECORD_ARG(5, thread_limit);
     LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)",
         fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
+    record->data_size = strlen(deviceFun) + 1 + strlen(deviceName) + 1;
+    if ((record->data = malloc(record->data_size)) == NULL) {
+        LOGE(LOG_ERROR, "could not allocate memory");
+        return 1;
+    }
+    strcpy(record->data, deviceFun);
+    strcpy(record->data + strlen(deviceFun) + 1, deviceName);
+
     GSCHED_RETAIN;
-    //resource_mg_print(&rm_modules);
     if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) {
         LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
         result->err = -1;
@@ -127,7 +150,6 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
     return 1;
 }
 
-// Does not support checkpoint/restart yet
 bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress, char *deviceName, int ext, size_t size,
                         int constant, int global, int *result, struct svc_req *rqstp)
 {
@@ -144,7 +166,13 @@ bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress
     LOG(LOG_DEBUG, "rpc_register_var(fatCubinHandle: %p, hostVar: %p, deviceAddress: %p, deviceName: %s, "
                    "ext: %d, size: %d, constant: %d, global: %d)",
                    fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global);
-
+    record->data_size = strlen(deviceName) + 1;
+    if ((record->data = malloc(record->data_size)) == NULL) {
+        LOGE(LOG_ERROR, "could not allocate memory");
+        return 1;
+    }
+    strcpy(record->data, deviceName);
+
     CUdeviceptr dptr = 0;
     size_t d_size = 0;
     CUresult res;

diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
@@ -87,7 +87,13 @@ int server_runtime_init(int restore)
         ret &= server_runtime_restore("ckp");
 	    ret &= cublaslt_init(0, &rm_memory);
     }
-
+
+    return ret;
+}
+
+int server_runtime_init_cuda(void)
+{
+    int ret = 0;
     // Make sure runtime API is initialized
     // If we don't do this and use the driver API, it might be unintialized
     cudaError_t cres;
@@ -96,7 +102,6 @@ int server_runtime_init(int restore)
         ret = 1;
     }
     cudaDeviceSynchronize();
-
     return ret;
 }
 
@@ -125,6 +130,10 @@ int server_runtime_checkpoint(const char *path, int dump_memory, unsigned long p
         return 1;
     }
     if (dump_memory == 1) {
+        if (cr_dump_elfs(path) != 0) {
+            LOGE(LOG_ERROR, "error dumping elfs");
+            return 1;
+        }
         if (cr_dump_memory(path) != 0) {
             LOGE(LOG_ERROR, "error dumping memory");
             return 1;
@@ -138,7 +147,8 @@ int server_runtime_restore(const char *path)
     struct timeval start, end;
     double time = 0;
     gettimeofday(&start, NULL);
-    if (cr_restore(path, &rm_memory, &rm_streams, &rm_events, &rm_arrays, cusolver_get_rm(), cublas_get_rm()) != 0) {
+    if (cr_restore(path, &rm_memory, &rm_streams, &rm_events, &rm_arrays,
+                   cusolver_get_rm(), cublas_get_rm(), &rm_modules) != 0) {
         LOGE(LOG_ERROR, "error restoring api_records");
         return 1;
     }

diff --git a/cpu/cpu-server-runtime.h b/cpu/cpu-server-runtime.h
@@ -1,6 +1,7 @@
 #ifndef _CPU_SERVER_RUNTIME_H_
 #define _CPU_SERVER_RUNTIME_H_
 
+int server_runtime_init_cuda(void);
 int server_runtime_init(int restore);
 int server_runtime_deinit(void);
 int server_runtime_checkpoint(const char *path, int dump_memory, unsigned long prog, unsigned long vers);

diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
@@ -208,6 +208,10 @@ void cricket_main(size_t prog_num, size_t vers_num)
             restore = 1;
     }
 
+    if (server_runtime_init_cuda() != 0) {
+        LOGE(LOG_ERROR, "initializing server_runtime failed.");
+        goto cleanup4;
+    }
     if (restore == 1) {
         if (cr_restore_rpc_id("ckp", &prog, &vers) != 0) {
             LOGE(LOG_ERROR, "error while restoring rpc id");
@@ -281,16 +285,16 @@ void cricket_main(size_t prog_num, size_t vers_num)
         goto cleanup4;
     }
 
-    if (server_runtime_init(restore) != 0) {
+    if (server_driver_init(restore) != 0) {
         LOGE(LOG_ERROR, "initializing server_runtime failed.");
-        goto cleanup3;
+        goto cleanup2;
     }
 
-    if (server_driver_init(restore) != 0) {
+    if (server_runtime_init(restore) != 0) {
         LOGE(LOG_ERROR, "initializing server_runtime failed.");
-        goto cleanup2;        
+        goto cleanup3;
     }
-    
+
     if (server_nvml_init(restore) != 0) {
         LOGE(LOG_ERROR, "initializing server_nvml failed.");
         goto cleanup1;