From da5e4080d73746b280d10ffac6e3ae6838b76203 Mon Sep 17 00:00:00 2001
From: Brodey Newman
Date: Mon, 23 Dec 2024 05:37:07 +0000
Subject: [PATCH] chore: dynamic unified pointers

---
 client.cpp             |    29 +
 codegen/codegen.py     |    47 +-
 codegen/gen_client.cpp | 14354 ++++++++++++++++++++++++++++++++++++++-
 local.sh               |     1 +
 test/cudnn_managed.cu  |    83 +
 5 files changed, 14509 insertions(+), 5 deletions(-)
 create mode 100644 test/cudnn_managed.cu

diff --git a/client.cpp b/client.cpp
index 14def33..96109bb 100644
--- a/client.cpp
+++ b/client.cpp
@@ -96,6 +96,35 @@ static void segfault(int sig, siginfo_t* info, void* unused) {
     raise(SIGSEGV);
 }
 
+// Copies one RPC argument if it is a tracked unified-memory pointer.
+//
+// `arg` is looked up by exact address in conns[index].unified_devices; the
+// mapped value is used as the byte count. On a hit, cudaMemcpy is issued in
+// direction `kind` with the same address as destination and source. A miss is
+// a no-op.
+// A failed copy is only logged to stderr; it is not reported to the caller.
+void maybe_copy_unified_arg(const int index, void* arg, enum cudaMemcpyKind kind)
+{
+    auto& unified_devices = conns[index].unified_devices;
+    auto found = unified_devices.find(arg);
+    if (found != unified_devices.end())
+    {
+        std::cout << "found unified arg pointer; copying..." << std::endl;
+
+        void* ptr = found->first;
+        size_t size = found->second;
+
+        cudaError_t res = cudaMemcpy(ptr, ptr, size, kind);
+
+        if (res != cudaSuccess) {
+            std::cerr << "cudaMemcpy failed: " << cudaGetErrorString(res) << std::endl;
+        } else {
+            std::cout << "Successfully copied " << size << " bytes" << std::endl;
+        }
+    }
+}
+
+
 static void set_segfault_handlers() {
     if (init > 0) {
         return;
diff --git a/codegen/codegen.py b/codegen/codegen.py
index 14d7b84..d4c84eb 100644
--- a/codegen/codegen.py
+++ b/codegen/codegen.py
@@ -104,6 +104,10 @@ def client_rpc_write(self, f):
             )
         )
 
+    def client_unified_copy(self, f, direction):
+        """Emit a maybe_copy_unified_arg() call for this pointer; `direction` is the cudaMemcpyKind name to emit."""
+        f.write(" maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
+
     @property
     def server_declaration(self) -> str:
         c = self.ptr.ptr_to.const
@@ -209,7 +213,25 @@ def client_rpc_write(self, f):
                     length=length,
                 )
             )
-    
+
+    def client_unified_copy(self, f, direction):
+        """Emit a maybe_copy_unified_arg() call for the array pointer, then a loop of calls over its elements."""
+        f.write(" maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
+
+        if isinstance(self.length, int):
+            f.write(" for (int i = 0; i < {name}; i++)\n".format(name=self.length))
+            f.write(" maybe_copy_unified_arg(0, (void*)&{name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
+        else:
+            if hasattr(self.length.type, "ptr_to"):
+                f.write(" for (int i = 0; i < static_cast(*{name}); i++)\n".format(name=self.length.name))
+                f.write(" maybe_copy_unified_arg(0, (void*)&{name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
+            else:
+                if hasattr(self.parameter.type, "ptr_to"):
+                    f.write(" for (int i = 0; i < static_cast({name}); i++)\n".format(name=self.length.name))
+                    f.write(" maybe_copy_unified_arg(0, (void*)&{name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
+                else:
+                    f.write(" for (int i = 0; i < static_cast({name}); i++)\n".format(name=self.length.name))
+                    f.write(" maybe_copy_unified_arg(0, (void*){name}[i], {direction});\n".format(name=self.parameter.name, direction=direction))
 
     @property
     def server_declaration(self) -> str:
@@ -330,6 +352,10 @@ def client_rpc_write(self, f):
     def server_declaration(self) -> str:
         return f" {self.ptr.format()} {self.parameter.name};\n" + \
             f" std::size_t {self.parameter.name}_len;\n"
+
+    def client_unified_copy(self, f, direction):
+        """Emit a maybe_copy_unified_arg() call for this pointer argument in the given direction."""
+        f.write(" maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
 
     def server_rpc_read(self, f, index) -> Optional[str]:
         if not self.send:
@@ -415,6 +441,13 @@ def server_declaration(self) -> str:
             return f" {self.type_.format().replace("const", "")} {self.parameter.name};\n"
         else:
             return f" {self.type_.format()} {self.parameter.name};\n"
+
+    def client_unified_copy(self, f, direction):
+        """Emit a maybe_copy_unified_arg() call on the argument itself if it is a Pointer, else on its address."""
+        if isinstance(self.type_, Pointer):
+            f.write(" maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
+        else:
+            f.write(" maybe_copy_unified_arg(0, (void*)&{name}, {direction});\n".format(name=self.parameter.name, direction=direction))
 
     def server_rpc_read(self, f):
         if not self.send:
@@ -486,6 +519,10 @@ def server_rpc_read(self, f):
                 param_type=self.type_.ptr_to.format(),
             )
         )
+
+    def client_unified_copy(self, f, direction):
+        """Emit a maybe_copy_unified_arg() call for this pointer argument in the given direction."""
+        f.write(" maybe_copy_unified_arg(0, (void*){name}, {direction});\n".format(name=self.parameter.name, direction=direction))
 
     @property
     def server_reference(self) -> str:
@@ -761,6 +798,7 @@ def main():
                 "extern int rpc_wait_for_response(const int index);\n"
                 "extern int rpc_read(const int index, void *data, const std::size_t size);\n"
                 "extern int rpc_end_response(const int index, void *return_value);\n"
+                "void maybe_copy_unified_arg(const int index, void *arg, enum cudaMemcpyKind kind);\n"
                 "extern int rpc_close();\n\n"
             )
             for function, annotation, operations, disabled in functions_with_annotations:
@@ -798,6 +836,9 @@ def main():
             )
             f.write("{\n")
 
+            for operation in operations:
+                operation.client_unified_copy(f, "cudaMemcpyHostToDevice")
+
             f.write(
                 " {return_type} return_value;\n".format(
                     return_type=function.return_type.format()
@@ -841,12 +882,14 @@ def main():
                 )
             )
 
+            for operation in operations:
+                operation.client_unified_copy(f, "cudaMemcpyDeviceToHost")
+
             if function.name.format() == "nvmlShutdown":
                 f.write(" if (rpc_close() < 0)\n")
                 f.write(" return {error_return};\n".format(error_return=error_const(function.return_type.format())))
 
             f.write(" return return_value;\n")
-
             f.write("}\n\n")
 
     f.write("std::unordered_map functionMap = {\n")
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp index 7b2dca9..ff768f4 100644 --- a/codegen/gen_client.cpp +++ b/codegen/gen_client.cpp @@ -19,7 +19,7 @@ extern int rpc_end_request(const int index); extern int rpc_wait_for_response(const int index); extern int rpc_read(const int index, void *data, const std::size_t size); extern int rpc_end_response(const int index, void *return_value); -void cuda_memcpy_unified_ptrs(const int index, cudaMemcpyKind kind); +void 
maybe_copy_unified_arg(const int index, void *arg, enum cudaMemcpyKind kind); extern int rpc_close(); nvmlReturn_t nvmlInit_v2() @@ -34,12 +34,14 @@ nvmlReturn_t nvmlInit_v2() nvmlReturn_t nvmlInitWithFlags(unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlInitWithFlags) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } @@ -57,6 +59,10 @@ nvmlReturn_t nvmlShutdown() nvmlReturn_t nvmlSystemGetDriverVersion(char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetDriverVersion) < 0 || rpc_write(0, &length, sizeof(unsigned int)) < 0 || @@ -64,11 +70,19 @@ nvmlReturn_t nvmlSystemGetDriverVersion(char* version, unsigned int length) rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetNVMLVersion(char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t 
return_value; if (rpc_start_request(0, RPC_nvmlSystemGetNVMLVersion) < 0 || rpc_write(0, &length, sizeof(unsigned int)) < 0 || @@ -76,33 +90,46 @@ nvmlReturn_t nvmlSystemGetNVMLVersion(char* version, unsigned int length) rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetCudaDriverVersion(int* cudaDriverVersion) { + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetCudaDriverVersion) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, cudaDriverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetCudaDriverVersion_v2(int* cudaDriverVersion) { + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetCudaDriverVersion_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, cudaDriverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)cudaDriverVersion, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetProcessName(unsigned int pid, char* name, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, 
(void*)&name[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetProcessName) < 0 || rpc_write(0, &pid, sizeof(unsigned int)) < 0 || @@ -111,22 +138,31 @@ nvmlReturn_t nvmlSystemGetProcessName(unsigned int pid, char* name, unsigned int rpc_read(0, name, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetCount(unsigned int* unitCount) { + maybe_copy_unified_arg(0, (void*)unitCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, unitCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)unitCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t* unit) { + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)unit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetHandleByIndex) < 0 || rpc_write(0, &index, sizeof(unsigned int)) < 0 || @@ -134,11 +170,15 @@ nvmlReturn_t nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t* unit) rpc_read(0, unit, sizeof(nvmlUnit_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)unit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t* info) { + 
maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetUnitInfo) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -146,11 +186,15 @@ nvmlReturn_t nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t* info) rpc_read(0, info, sizeof(nvmlUnitInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t* state) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)state, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetLedState) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -158,11 +202,15 @@ nvmlReturn_t nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t* state) rpc_read(0, state, sizeof(nvmlLedState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)state, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t* psu) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)psu, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetPsuInfo) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -170,11 +218,16 @@ nvmlReturn_t nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t* psu) rpc_read(0, psu, sizeof(nvmlPSUInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)psu, 
cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int* temp) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetTemperature) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -183,11 +236,16 @@ nvmlReturn_t nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned rpc_read(0, temp, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t* fanSpeeds) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fanSpeeds, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetFanSpeedInfo) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -195,11 +253,18 @@ nvmlReturn_t nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t* fanSp rpc_read(0, fanSpeeds, sizeof(nvmlUnitFanSpeeds_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fanSpeeds, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int* deviceCount, nvmlDevice_t* devices) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devices, cudaMemcpyHostToDevice); + for (int i = 
0; i < static_cast(*deviceCount); i++) + maybe_copy_unified_arg(0, (void*)&devices[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitGetDevices) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -209,11 +274,20 @@ nvmlReturn_t nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int* deviceCount, nvml rpc_read(0, devices, *deviceCount * sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devices, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*deviceCount); i++) + maybe_copy_unified_arg(0, (void*)&devices[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetHicVersion(unsigned int* hwbcCount, nvmlHwbcEntry_t* hwbcEntries) { + maybe_copy_unified_arg(0, (void*)hwbcCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)hwbcEntries, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*hwbcCount); i++) + maybe_copy_unified_arg(0, (void*)&hwbcEntries[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetHicVersion) < 0 || rpc_write(0, hwbcCount, sizeof(unsigned int)) < 0 || @@ -222,22 +296,30 @@ nvmlReturn_t nvmlSystemGetHicVersion(unsigned int* hwbcCount, nvmlHwbcEntry_t* h rpc_read(0, hwbcEntries, *hwbcCount * sizeof(nvmlHwbcEntry_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)hwbcCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)hwbcEntries, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*hwbcCount); i++) + maybe_copy_unified_arg(0, (void*)&hwbcEntries[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount) { + maybe_copy_unified_arg(0, (void*)deviceCount, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCount_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, deviceCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t* attributes) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAttributes_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -245,11 +327,15 @@ nvmlReturn_t nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttribute rpc_read(0, attributes, sizeof(nvmlDeviceAttributes_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleByIndex_v2) < 0 || rpc_write(0, &index, sizeof(unsigned int)) < 0 || @@ -257,11 +343,15 @@ nvmlReturn_t nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t* dev rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleBySerial(const char* serial, nvmlDevice_t* device) { + 
maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; std::size_t serial_len = std::strlen(serial) + 1; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleBySerial) < 0 || @@ -271,11 +361,15 @@ nvmlReturn_t nvmlDeviceGetHandleBySerial(const char* serial, nvmlDevice_t* devic rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleByUUID(const char* uuid, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; std::size_t uuid_len = std::strlen(uuid) + 1; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleByUUID) < 0 || @@ -285,11 +379,15 @@ nvmlReturn_t nvmlDeviceGetHandleByUUID(const char* uuid, nvmlDevice_t* device) rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char* pciBusId, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; std::size_t pciBusId_len = std::strlen(pciBusId) + 1; if (rpc_start_request(0, RPC_nvmlDeviceGetHandleByPciBusId_v2) < 0 || @@ -299,11 +397,18 @@ nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char* pciBusId, nvmlDevice_t rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char* name, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetName) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -312,11 +417,18 @@ nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char* name, unsigned int len rpc_read(0, name, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBrand) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -324,11 +436,15 @@ nvmlReturn_t nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t* type) rpc_read(0, type, sizeof(nvmlBrandType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyDeviceToHost); return return_value; } 
nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)index, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetIndex) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -336,11 +452,18 @@ nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) rpc_read(0, index, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)index, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSerial(nvmlDevice_t device, char* serial, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&serial[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSerial) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -349,11 +472,22 @@ nvmlReturn_t nvmlDeviceGetSerial(nvmlDevice_t device, char* serial, unsigned int rpc_read(0, serial, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)serial, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&serial[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long* nodeSet, nvmlAffinityScope_t scope) { + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nodeSetSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeSet, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(nodeSetSize); i++) + maybe_copy_unified_arg(0, (void*)&nodeSet[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -363,11 +497,23 @@ nvmlReturn_t nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeS rpc_read(0, nodeSet, nodeSetSize * sizeof(unsigned long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nodeSetSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeSet, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(nodeSetSize); i++) + maybe_copy_unified_arg(0, (void*)&nodeSet[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long* cpuSet, nvmlAffinityScope_t scope) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCpuAffinityWithinScope) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -377,11 +523,22 @@ nvmlReturn_t nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t 
device, unsigned i rpc_read(0, cpuSet, cpuSetSize * sizeof(unsigned long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long* cpuSet) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCpuAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -390,33 +547,45 @@ nvmlReturn_t nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSi rpc_read(0, cpuSet, cpuSetSize * sizeof(unsigned long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cpuSetSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cpuSet, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(cpuSetSize); i++) + maybe_copy_unified_arg(0, (void*)&cpuSet[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetCpuAffinity(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetCpuAffinity) < 0 || rpc_write(0, &device, 
sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearCpuAffinity(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearCpuAffinity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t* pathInfo) { + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pathInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTopologyCommonAncestor) < 0 || rpc_write(0, &device1, sizeof(nvmlDevice_t)) < 0 || @@ -425,11 +594,20 @@ nvmlReturn_t nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevic rpc_read(0, pathInfo, sizeof(nvmlGpuTopologyLevel_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pathInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int* count, nvmlDevice_t* deviceArray) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTopologyNearestGpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -440,11 +618,22 @@ nvmlReturn_t nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopolo rpc_read(0, deviceArray, *count * sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int* count, nvmlDevice_t* deviceArray) { + maybe_copy_unified_arg(0, (void*)&cpuNumber, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSystemGetTopologyGpuSet) < 0 || rpc_write(0, &cpuNumber, sizeof(unsigned int)) < 0 || @@ -454,11 +643,20 @@ nvmlReturn_t nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int* c rpc_read(0, deviceArray, *count * sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&cpuNumber, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&deviceArray[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) { + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&p2pIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)p2pStatus, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetP2PStatus) < 0 || rpc_write(0, &device1, sizeof(nvmlDevice_t)) < 0 || @@ -468,11 +666,20 @@ nvmlReturn_t nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, rpc_read(0, p2pStatus, sizeof(nvmlGpuP2PStatus_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&p2pIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)p2pStatus, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char* uuid, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetUUID) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -481,11 +688,21 @@ nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char* uuid, unsigned int len 
rpc_read(0, uuid, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char* mdevUuid, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mdevUuid, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&mdevUuid[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetMdevUUID) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -494,11 +711,18 @@ nvmlReturn_t nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char* rpc_read(0, mdevUuid, size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mdevUuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&mdevUuid[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minorNumber, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMinorNumber) < 0 || rpc_write(0, &device, 
sizeof(nvmlDevice_t)) < 0 || @@ -506,11 +730,18 @@ nvmlReturn_t nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNu rpc_read(0, minorNumber, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minorNumber, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)partNumber, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&partNumber[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBoardPartNumber) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -519,11 +750,22 @@ nvmlReturn_t nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, rpc_read(0, partNumber, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)partNumber, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&partNumber[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for 
(int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetInforomVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -533,11 +775,22 @@ nvmlReturn_t nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_ rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetInforomImageVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -546,11 +799,18 @@ nvmlReturn_t nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char* version rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, 
(void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int* checksum) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)checksum, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetInforomConfigurationChecksum) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -558,22 +818,28 @@ nvmlReturn_t nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsi rpc_read(0, checksum, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)checksum, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceValidateInforom(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceValidateInforom) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t* display) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)display, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDisplayMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -581,11 +847,15 @@ nvmlReturn_t nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t* di rpc_read(0, display, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)display, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t* isActive) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDisplayActive) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -593,11 +863,15 @@ nvmlReturn_t nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t* rpc_read(0, isActive, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPersistenceMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -605,11 +879,15 @@ nvmlReturn_t nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t* pci) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPciInfo_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -617,11 +895,15 @@ nvmlReturn_t 
nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t* pci) rpc_read(0, pci, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int* maxLinkGen) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLinkGen, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxPcieLinkGeneration) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -629,11 +911,15 @@ nvmlReturn_t nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned in rpc_read(0, maxLinkGen, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLinkGen, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int* maxLinkGenDevice) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLinkGenDevice, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuMaxPcieLinkGeneration) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -641,11 +927,15 @@ nvmlReturn_t nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned rpc_read(0, maxLinkGenDevice, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLinkGenDevice, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, 
unsigned int* maxLinkWidth) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLinkWidth, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxPcieLinkWidth) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -653,11 +943,15 @@ nvmlReturn_t nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int* ma rpc_read(0, maxLinkWidth, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLinkWidth, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int* currLinkGen) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currLinkGen, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCurrPcieLinkGeneration) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -665,11 +959,15 @@ nvmlReturn_t nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned i rpc_read(0, currLinkGen, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currLinkGen, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int* currLinkWidth) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currLinkWidth, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCurrPcieLinkWidth) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -677,11 +975,16 @@ nvmlReturn_t nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int* c rpc_read(0, 
currLinkWidth, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currLinkWidth, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int* value) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieThroughput) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -690,11 +993,16 @@ nvmlReturn_t nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounte rpc_read(0, value, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int* value) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieReplayCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -702,11 +1010,16 @@ nvmlReturn_t nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int* v rpc_read(0, value, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t 
device, nvmlClockType_t type, unsigned int* clock) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetClockInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -715,11 +1028,17 @@ nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, u rpc_read(0, clock, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int* clock) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxClockInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -728,11 +1047,17 @@ nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type rpc_read(0, clock, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clock, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetApplicationsClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -741,11 +1066,17 @@ nvmlReturn_t nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDefaultApplicationsClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -754,22 +1085,31 @@ nvmlReturn_t nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlCloc rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetApplicationsClocks(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetApplicationsClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return 
NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -779,11 +1119,18 @@ nvmlReturn_t nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clockId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int* clockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clockType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxCustomerBoostClock) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -792,11 +1139,19 @@ nvmlReturn_t nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockTy rpc_read(0, clockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&clockType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int* count, unsigned int* clocksMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedMemoryClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -806,11 +1161,22 @@ nvmlReturn_t nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned in rpc_read(0, clocksMHz, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int* count, unsigned int* clocksMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memoryClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedGraphicsClocks) < 0 || rpc_write(0, 
&device, sizeof(nvmlDevice_t)) < 0 || @@ -821,11 +1187,20 @@ nvmlReturn_t nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned rpc_read(0, clocksMHz, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memoryClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clocksMHz, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&clocksMHz[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t* isEnabled, nvmlEnableState_t* defaultIsEnabled) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultIsEnabled, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAutoBoostedClocksEnabled) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -834,11 +1209,16 @@ nvmlReturn_t nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnab rpc_read(0, defaultIsEnabled, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultIsEnabled, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, 
RPC_nvmlDeviceSetAutoBoostedClocksEnabled) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -846,11 +1226,16 @@ nvmlReturn_t nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnab rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetDefaultAutoBoostedClocksEnabled) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -859,11 +1244,16 @@ nvmlReturn_t nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, n rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&enabled, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int* speed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFanSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -871,11 +1261,16 @@ nvmlReturn_t nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int* speed) rpc_read(0, speed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int* speed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFanSpeed_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -884,11 +1279,17 @@ nvmlReturn_t nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, uns rpc_read(0, speed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)speed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int* targetSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)targetSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTargetFanSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -897,11 +1298,16 @@ nvmlReturn_t nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, rpc_read(0, targetSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)targetSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t 
nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetDefaultFanSpeed_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -909,11 +1315,16 @@ nvmlReturn_t nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int f rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int* minSpeed, unsigned int* maxSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minSpeed, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMinMaxFanSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -922,11 +1333,17 @@ nvmlReturn_t nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int* minS rpc_read(0, maxSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minSpeed, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, nvmlFanControlPolicy_t* policy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)policy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if 
(rpc_start_request(0, RPC_nvmlDeviceGetFanControlPolicy_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -935,11 +1352,17 @@ nvmlReturn_t nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int rpc_read(0, policy, sizeof(nvmlFanControlPolicy_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)policy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, nvmlFanControlPolicy_t policy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&policy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetFanControlPolicy) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -948,11 +1371,16 @@ nvmlReturn_t nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&policy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int* numFans) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numFans, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNumFans) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -960,11 +1388,16 @@ nvmlReturn_t nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int* numFans) rpc_read(0, numFans, sizeof(unsigned int)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numFans, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int* temp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sensorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTemperature) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -973,11 +1406,17 @@ nvmlReturn_t nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensor rpc_read(0, temp, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sensorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int* temp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTemperatureThreshold) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -986,11 +1425,17 @@ nvmlReturn_t nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperat rpc_read(0, temp, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int* temp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetTemperatureThreshold) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1000,11 +1445,17 @@ nvmlReturn_t nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperat rpc_read(0, temp, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&thresholdType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)temp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t* pThermalSettings) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sensorIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pThermalSettings, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetThermalSettings) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1013,11 +1464,16 @@ nvmlReturn_t nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sens rpc_read(0, pThermalSettings, sizeof(nvmlGpuThermalSettings_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sensorIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pThermalSettings, cudaMemcpyDeviceToHost); return 
return_value; } nvmlReturn_t nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t* pState) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPerformanceState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1025,11 +1481,15 @@ nvmlReturn_t nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t* p rpc_read(0, pState, sizeof(nvmlPstates_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long* clocksThrottleReasons) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)clocksThrottleReasons, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCurrentClocksThrottleReasons) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1037,11 +1497,15 @@ nvmlReturn_t nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsi rpc_read(0, clocksThrottleReasons, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)clocksThrottleReasons, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long* supportedClocksThrottleReasons) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)supportedClocksThrottleReasons, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, 
RPC_nvmlDeviceGetSupportedClocksThrottleReasons) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1049,11 +1513,15 @@ nvmlReturn_t nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, un rpc_read(0, supportedClocksThrottleReasons, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)supportedClocksThrottleReasons, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t* pState) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1061,11 +1529,15 @@ nvmlReturn_t nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t* pState) rpc_read(0, pState, sizeof(nvmlPstates_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1073,11 +1545,15 @@ nvmlReturn_t nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableSta rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int* limit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1085,11 +1561,16 @@ nvmlReturn_t nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int rpc_read(0, limit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int* minLimit, unsigned int* maxLimit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minLimit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementLimitConstraints) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1098,11 +1579,16 @@ nvmlReturn_t nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, u rpc_read(0, maxLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minLimit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int* defaultLimit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultLimit, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerManagementDefaultLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1110,11 +1596,15 @@ nvmlReturn_t nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsig rpc_read(0, defaultLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int* power) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)power, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerUsage) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1122,11 +1612,15 @@ nvmlReturn_t nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int* power) rpc_read(0, power, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)power, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long* energy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)energy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTotalEnergyConsumption) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1134,11 +1628,15 @@ nvmlReturn_t nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned l rpc_read(0, energy, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)energy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int* limit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEnforcedPowerLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1146,11 +1644,16 @@ nvmlReturn_t nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int* rpc_read(0, limit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)limit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t* current, nvmlGpuOperationMode_t* pending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuOperationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1159,11 +1662,16 @@ nvmlReturn_t nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperation rpc_read(0, pending, sizeof(nvmlGpuOperationMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t* memory) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)memory, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1171,11 +1679,15 @@ nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t* memory) rpc_read(0, memory, sizeof(nvmlMemory_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t* memory) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryInfo_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1183,11 +1695,15 @@ nvmlReturn_t nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t* me rpc_read(0, memory, sizeof(nvmlMemory_v2_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memory, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetComputeMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1195,11 +1711,16 @@ nvmlReturn_t nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t* mo rpc_read(0, mode, sizeof(nvmlComputeMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCudaComputeCapability) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1208,11 +1729,17 @@ nvmlReturn_t nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, rpc_read(0, minor, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t* current, nvmlEnableState_t* pending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEccMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1221,11 +1748,16 @@ nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t* curren rpc_read(0, pending, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t* defaultMode) { + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDefaultEccMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1233,11 +1765,15 @@ nvmlReturn_t nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t* rpc_read(0, defaultMode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int* boardId) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)boardId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBoardId) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1245,11 +1781,15 @@ nvmlReturn_t nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int* boardId) rpc_read(0, boardId, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)boardId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int* multiGpuBool) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)multiGpuBool, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMultiGpuBoard) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1257,11 +1797,17 @@ nvmlReturn_t nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int* multi rpc_read(0, multiGpuBool, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) 
return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)multiGpuBool, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long* eccCounts) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetTotalEccErrors) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1271,11 +1817,19 @@ nvmlReturn_t nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorTyp rpc_read(0, eccCounts, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t* eccCounts) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDetailedEccErrors) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1285,11 +1839,20 @@ nvmlReturn_t 
nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryError rpc_read(0, eccCounts, sizeof(nvmlEccErrorCounts_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eccCounts, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlMemoryLocation_t locationType, unsigned long long* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&locationType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryErrorCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1300,11 +1863,18 @@ nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErro rpc_read(0, count, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&errorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&locationType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t* utilization) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetUtilizationRates) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1312,11 +1882,16 @@ nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_ rpc_read(0, utilization, sizeof(nvmlUtilization_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int* utilization, unsigned int* samplingPeriodUs) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1325,11 +1900,17 @@ nvmlReturn_t nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int* rpc_read(0, samplingPeriodUs, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderCapacity(nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int* encoderCapacity) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&encoderQueryType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, 
RPC_nvmlDeviceGetEncoderCapacity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1338,11 +1919,18 @@ nvmlReturn_t nvmlDeviceGetEncoderCapacity(nvmlDevice_t device, nvmlEncoderType_t rpc_read(0, encoderCapacity, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&encoderQueryType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderStats(nvmlDevice_t device, unsigned int* sessionCount, unsigned int* averageFps, unsigned int* averageLatency) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderStats) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1352,11 +1940,20 @@ nvmlReturn_t nvmlDeviceGetEncoderStats(nvmlDevice_t device, unsigned int* sessio rpc_read(0, averageLatency, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int* sessionCount, nvmlEncoderSessionInfo_t* sessionInfos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)sessionInfos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetEncoderSessions) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1366,11 +1963,19 @@ nvmlReturn_t nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int* ses rpc_read(0, sessionInfos, *sessionCount * sizeof(nvmlEncoderSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int* utilization, unsigned int* samplingPeriodUs) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDecoderUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1379,11 +1984,16 @@ nvmlReturn_t nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int* rpc_read(0, samplingPeriodUs, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)samplingPeriodUs, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFBCStats(nvmlDevice_t 
device, nvmlFBCStats_t* fbcStats) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFBCStats) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1391,11 +2001,18 @@ nvmlReturn_t nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t* fbcStats rpc_read(0, fbcStats, sizeof(nvmlFBCStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int* sessionCount, nvmlFBCSessionInfo_t* sessionInfo) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFBCSessions) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1405,11 +2022,19 @@ nvmlReturn_t nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int* session rpc_read(0, sessionInfo, *sessionCount * sizeof(nvmlFBCSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDriverModel(nvmlDevice_t device, 
nvmlDriverModel_t* current, nvmlDriverModel_t* pending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDriverModel) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1418,11 +2043,19 @@ nvmlReturn_t nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t* cu rpc_read(0, pending, sizeof(nvmlDriverModel_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVbiosVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1431,11 +2064,18 @@ nvmlReturn_t nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char* version, unsig rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return
return_value; } nvmlReturn_t nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t* bridgeHierarchy) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bridgeHierarchy, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBridgeChipInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1443,11 +2083,18 @@ nvmlReturn_t nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHier rpc_read(0, bridgeHierarchy, sizeof(nvmlBridgeChipHierarchy_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bridgeHierarchy, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetComputeRunningProcesses_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1457,11 +2104,21 @@ nvmlReturn_t nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsign rpc_read(0, infos, *infoCount * sizeof(nvmlProcessInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i],
cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGraphicsRunningProcesses_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1471,11 +2128,21 @@ nvmlReturn_t nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsig rpc_read(0, infos, *infoCount * sizeof(nvmlProcessInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_t* infos) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMPSComputeRunningProcesses_v3) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1485,11 +2152,19 @@ nvmlReturn_t nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, uns rpc_read(0, infos,
*infoCount * sizeof(nvmlProcessInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infoCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)infos, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*infoCount); i++) + maybe_copy_unified_arg(0, (void*)&infos[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int* onSameBoard) { + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)onSameBoard, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceOnSameBoard) < 0 || rpc_write(0, &device1, sizeof(nvmlDevice_t)) < 0 || @@ -1498,11 +2173,17 @@ nvmlReturn_t nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, i rpc_read(0, onSameBoard, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)onSameBoard, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t* isRestricted) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isRestricted, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAPIRestriction) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1511,11 +2192,22 @@ nvmlReturn_t nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_ rpc_read(0, isRestricted,
sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isRestricted, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, nvmlValueType_t* sampleValType, unsigned int* sampleCount, nvmlSample_t* samples) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sampleCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)samples, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*sampleCount); i++) + maybe_copy_unified_arg(0, (void*)&samples[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSamples) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1528,11 +2220,21 @@ nvmlReturn_t nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, rpc_read(0, samples, *sampleCount * sizeof(nvmlSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sampleCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)samples, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*sampleCount); i++) + maybe_copy_unified_arg(0,
(void*)&samples[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t* bar1Memory) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bar1Memory, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBAR1MemoryInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1540,11 +2242,16 @@ nvmlReturn_t nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t* rpc_read(0, bar1Memory, sizeof(nvmlBAR1Memory_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bar1Memory, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t* violTime) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&perfPolicyType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)violTime, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetViolationStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1553,11 +2260,16 @@ nvmlReturn_t nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyTyp rpc_read(0, violTime, sizeof(nvmlViolationTime_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&perfPolicyType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)violTime, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int* irqNum) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)irqNum, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetIrqNum) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1565,11 +2277,15 @@ nvmlReturn_t nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int* irqNum) rpc_read(0, irqNum, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)irqNum, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int* numCores) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numCores, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNumGpuCores) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1577,11 +2293,15 @@ nvmlReturn_t nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int* numCore rpc_read(0, numCores, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numCores, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t* powerSource) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)powerSource, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPowerSource) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1589,11 +2309,15 @@ nvmlReturn_t nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t* po rpc_read(0, powerSource, sizeof(nvmlPowerSource_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)powerSource, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int* busWidth) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)busWidth, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemoryBusWidth) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1601,11 +2325,15 @@ nvmlReturn_t nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int* busW rpc_read(0, busWidth, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)busWidth, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int* maxSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieLinkMaxSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1613,11 +2341,15 @@ nvmlReturn_t nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int* ma rpc_read(0, maxSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int* pcieSpeed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pcieSpeed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPcieSpeed) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1625,11 +2357,15 @@ nvmlReturn_t 
nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int* pcieSpeed rpc_read(0, pcieSpeed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pcieSpeed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int* adaptiveClockStatus) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)adaptiveClockStatus, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAdaptiveClockInfoStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1637,11 +2373,15 @@ nvmlReturn_t nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned rpc_read(0, adaptiveClockStatus, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)adaptiveClockStatus, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1649,11 +2389,16 @@ nvmlReturn_t nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t* rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned 
int pid, nvmlAccountingStats_t* stats) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingStats) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1662,11 +2407,19 @@ nvmlReturn_t nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, rpc_read(0, stats, sizeof(nvmlAccountingStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int* count, unsigned int* pids) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingPids) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1676,11 +2429,18 @@ nvmlReturn_t nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int* coun rpc_read(0, pids, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyDeviceToHost); return return_value; }
nvmlReturn_t nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetAccountingBufferSize) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1688,11 +2448,19 @@ nvmlReturn_t nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int rpc_read(0, bufferSize, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, unsigned int* pageCount, unsigned long long* addresses) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRetiredPages) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1703,11 +2471,26 @@ nvmlReturn_t nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCa rpc_read(0, addresses, *pageCount * sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)addresses,
cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, unsigned int* pageCount, unsigned long long* addresses, unsigned long long* timestamps) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)timestamps, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&timestamps[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRetiredPages_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1719,11 +2502,22 @@ nvmlReturn_t nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetiremen rpc_read(0, timestamps, *pageCount * sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cause, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pageCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)addresses, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&addresses[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)timestamps, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*pageCount); i++) + maybe_copy_unified_arg(0, (void*)&timestamps[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t
nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t* isPending) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRetiredPagesPendingStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1731,11 +2525,18 @@ nvmlReturn_t nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEna rpc_read(0, isPending, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int* corrRows, unsigned int* uncRows, unsigned int* isPending, unsigned int* failureOccurred) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)corrRows, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)uncRows, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)failureOccurred, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRemappedRows) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1746,11 +2547,18 @@ nvmlReturn_t nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int* corrRo rpc_read(0, failureOccurred, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)corrRows, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)uncRows, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isPending, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)failureOccurred, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t* values) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetRowRemapperHistogram) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1758,11 +2566,15 @@ nvmlReturn_t nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemap rpc_read(0, values, sizeof(nvmlRowRemapperHistogramValues_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t* arch) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)arch, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetArchitecture) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1770,11 +2582,15 @@ nvmlReturn_t nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitectu rpc_read(0, arch, sizeof(nvmlDeviceArchitecture_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)arch, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color) { + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&color, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlUnitSetLedState) < 0 || rpc_write(0, &unit, sizeof(nvmlUnit_t)) < 0 || @@ -1782,11 +2598,15 
@@ nvmlReturn_t nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&unit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&color, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetPersistenceMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1794,11 +2614,15 @@ nvmlReturn_t nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetComputeMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1806,11 +2630,15 @@ nvmlReturn_t nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ecc, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetEccMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1818,11 +2646,15 @@ nvmlReturn_t nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ecc, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearEccErrorCounts) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1830,11 +2662,16 @@ nvmlReturn_t nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterTy rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counterType, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&driverModel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetDriverModel) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1843,11 +2680,17 @@ nvmlReturn_t nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t dri rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, 
(void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&driverModel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minGpuClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxGpuClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetGpuLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1856,22 +2699,30 @@ nvmlReturn_t nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minG rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minGpuClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxGpuClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetGpuLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minMemClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxMemClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t 
return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetMemoryLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1880,22 +2731,30 @@ nvmlReturn_t nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int m rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minMemClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxMemClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetMemoryLockedClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graphicsClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetApplicationsClocks) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1904,11 +2763,16 @@ nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int m rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graphicsClockMHz, cudaMemcpyDeviceToHost); 
return return_value; } nvmlReturn_t nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t* status) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)status, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetClkMonStatus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1916,11 +2780,15 @@ nvmlReturn_t nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t* rpc_read(0, status, sizeof(nvmlClkMonStatus_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)status, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetPowerManagementLimit) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1928,11 +2796,15 @@ nvmlReturn_t nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetGpuOperationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1940,11 +2812,16 @@ nvmlReturn_t nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, 
nvmlGpuOperation rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&isRestricted, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetAPIRestriction) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1953,11 +2830,16 @@ nvmlReturn_t nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&apiType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&isRestricted, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetAccountingMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1965,22 +2847,29 @@ nvmlReturn_t nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearAccountingPids(nvmlDevice_t 
device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearAccountingPids) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -1989,11 +2878,17 @@ nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nv rpc_read(0, isActive, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isActive, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int* version) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2002,11 +2897,18 @@ nvmlReturn_t nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, rpc_read(0, version, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return 
NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkCapability) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2016,11 +2918,18 @@ nvmlReturn_t nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int lin rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkRemotePciInfo_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2029,11 +2938,18 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned i rpc_read(0, pci, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pci, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, nvmlNvLinkErrorCounter_t counter, unsigned long long* counterValue) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)counterValue, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkErrorCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2043,11 +2959,17 @@ nvmlReturn_t nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int l rpc_read(0, counterValue, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)counterValue, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetNvLinkErrorCounters) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2055,11 +2977,18 @@ nvmlReturn_t nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned in rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control, unsigned int reset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetNvLinkUtilizationControl) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2070,11 +2999,20 @@ nvmlReturn_t nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkUtilizationControl) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2084,11 
+3022,20 @@ nvmlReturn_t nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned rpc_read(0, control, sizeof(nvmlNvLinkUtilizationControl_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)control, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, unsigned long long* rxcounter, unsigned long long* txcounter) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)rxcounter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)txcounter, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkUtilizationCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2099,11 +3046,20 @@ nvmlReturn_t nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned rpc_read(0, txcounter, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)rxcounter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)txcounter, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceFreezeNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlEnableState_t freeze) { + maybe_copy_unified_arg(0, 
(void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&freeze, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceFreezeNvLinkUtilizationCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2113,11 +3069,18 @@ nvmlReturn_t nvmlDeviceFreezeNvLinkUtilizationCounter(nvmlDevice_t device, unsig rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&freeze, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceResetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceResetNvLinkUtilizationCounter) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2126,11 +3089,17 @@ nvmlReturn_t nvmlDeviceResetNvLinkUtilizationCounter(nvmlDevice_t device, unsign rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&counter, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t* pNvLinkDeviceType) { + maybe_copy_unified_arg(0, 
(void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNvLinkDeviceType, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetNvLinkRemoteDeviceType) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2139,22 +3108,30 @@ nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned i rpc_read(0, pNvLinkDeviceType, sizeof(nvmlIntNvLinkDeviceType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&link, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNvLinkDeviceType, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlEventSetCreate(nvmlEventSet_t* set) { + maybe_copy_unified_arg(0, (void*)set, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlEventSetCreate) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, set, sizeof(nvmlEventSet_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)set, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&eventTypes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceRegisterEvents) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2163,11 +3140,16 @@ nvmlReturn_t nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long ev rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&eventTypes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long* eventTypes) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eventTypes, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedEventTypes) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2175,11 +3157,16 @@ nvmlReturn_t nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long rpc_read(0, eventTypes, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eventTypes, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t* data, unsigned int timeoutms) { + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&timeoutms, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlEventSetWait_v2) < 0 || rpc_write(0, &set, sizeof(nvmlEventSet_t)) < 0 || @@ -2188,22 +3175,29 @@ nvmlReturn_t nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t* data, unsi rpc_read(0, data, sizeof(nvmlEventData_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&timeoutms, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set) { + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyHostToDevice); nvmlReturn_t 
return_value; if (rpc_start_request(0, RPC_nvmlEventSetFree) < 0 || rpc_write(0, &set, sizeof(nvmlEventSet_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&set, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceModifyDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_t newState) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&newState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceModifyDrainState) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2212,11 +3206,15 @@ nvmlReturn_t nvmlDeviceModifyDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_ rpc_read(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&newState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceQueryDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_t* currentState) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currentState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceQueryDrainState) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2225,11 +3223,16 @@ nvmlReturn_t nvmlDeviceQueryDrainState(nvmlPciInfo_t* pciInfo, nvmlEnableState_t rpc_read(0, currentState, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currentState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t* pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState) { + 
maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpuState, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&linkState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceRemoveGpu_v2) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2239,11 +3242,15 @@ nvmlReturn_t nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t* pciInfo, nvmlDetachGpuState_t rpc_read(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpuState, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&linkState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceDiscoverGpus(nvmlPciInfo_t* pciInfo) { + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceDiscoverGpus) < 0 || rpc_write(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || @@ -2251,11 +3258,17 @@ nvmlReturn_t nvmlDeviceDiscoverGpus(nvmlPciInfo_t* pciInfo) rpc_read(0, pciInfo, sizeof(nvmlPciInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)pciInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetFieldValues) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2264,11 +3277,21 @@ nvmlReturn_t
nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvml rpc_read(0, values, valuesCount * sizeof(nvmlFieldValue_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceClearFieldValues) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2277,11 +3300,18 @@ nvmlReturn_t nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nv rpc_read(0, values, valuesCount * sizeof(nvmlFieldValue_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&valuesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)values, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(valuesCount); i++) + maybe_copy_unified_arg(0, (void*)&values[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t* pVirtualMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pVirtualMode,
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVirtualizationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2289,11 +3319,15 @@ nvmlReturn_t nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtual rpc_read(0, pVirtualMode, sizeof(nvmlGpuVirtualizationMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pVirtualMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t* pHostVgpuMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pHostVgpuMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetHostVgpuMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2301,11 +3335,15 @@ nvmlReturn_t nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t* rpc_read(0, pHostVgpuMode, sizeof(nvmlHostVgpuMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pHostVgpuMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&virtualMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetVirtualizationMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2313,11 +3351,15 @@ nvmlReturn_t nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtual rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&virtualMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t* pGridLicensableFeatures) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pGridLicensableFeatures, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGridLicensableFeatures_v4) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2325,11 +3367,19 @@ nvmlReturn_t nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGri rpc_read(0, pGridLicensableFeatures, sizeof(nvmlGridLicensableFeatures_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pGridLicensableFeatures, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t* utilization, unsigned int* processSamplesCount, unsigned long long lastSeenTimeStamp) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)processSamplesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*processSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilization[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetProcessUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2340,11 +3390,19 @@ nvmlReturn_t nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUti rpc_read(0, utilization, *processSamplesCount * 
sizeof(nvmlProcessUtilizationSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)processSamplesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilization, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*processSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilization[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char* version) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGspFirmwareVersion) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2352,11 +3410,16 @@ nvmlReturn_t nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char* version) rpc_read(0, version, sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int* isEnabled, unsigned int* defaultMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGspFirmwareMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2365,11 +3428,16 @@ nvmlReturn_t nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int* isE rpc_read(0, defaultMode, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 
0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)defaultMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetVgpuDriverCapabilities) < 0 || rpc_write(0, &capability, sizeof(nvmlVgpuDriverCapability_t)) < 0 || @@ -2377,11 +3445,16 @@ nvmlReturn_t nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuCapabilities) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2390,11 +3463,19 @@ nvmlReturn_t nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCa rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, 
cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int* vgpuCount, nvmlVgpuTypeId_t* vgpuTypeIds) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedVgpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2404,11 +3485,21 @@ nvmlReturn_t nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int* vgpu rpc_read(0, vgpuTypeIds, *vgpuCount * sizeof(nvmlVgpuTypeId_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int* vgpuCount, nvmlVgpuTypeId_t* vgpuTypeIds) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetCreatableVgpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2418,11 +3509,21 @@ nvmlReturn_t nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int* vgpu rpc_read(0, vgpuTypeIds, 
*vgpuCount * sizeof(nvmlVgpuTypeId_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeIds, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeIds[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeClass, unsigned int* size) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeClass, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeClass[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetClass) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2431,11 +3532,21 @@ nvmlReturn_t nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeCla rpc_read(0, vgpuTypeClass, *size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeClass, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeClass[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeName, unsigned int* size) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeName, cudaMemcpyHostToDevice); + for (int i = 
0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeName[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetName) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2445,11 +3556,18 @@ nvmlReturn_t nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeName rpc_read(0, vgpuTypeName, *size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeName, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeName[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* gpuInstanceProfileId) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstanceProfileId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetGpuInstanceProfileId) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2457,11 +3575,16 @@ nvmlReturn_t nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, un rpc_read(0, gpuInstanceProfileId, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstanceProfileId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long* deviceID, unsigned long long* subsystemID) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceID, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)subsystemID, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetDeviceID) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2470,11 +3593,16 @@ nvmlReturn_t nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long rpc_read(0, subsystemID, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceID, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)subsystemID, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long* fbSize) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbSize, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetFramebufferSize) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2482,11 +3610,15 @@ nvmlReturn_t nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigne rpc_read(0, fbSize, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbSize, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* numDisplayHeads) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDisplayHeads, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetNumDisplayHeads) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2494,11 +3626,17 @@ nvmlReturn_t nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigne rpc_read(0, 
numDisplayHeads, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDisplayHeads, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int* xdim, unsigned int* ydim) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&displayIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)xdim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)ydim, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetResolution) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2508,11 +3646,20 @@ nvmlReturn_t nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int rpc_read(0, ydim, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&displayIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)xdim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)ydim, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeLicenseString, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeLicenseString, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeLicenseString[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetLicense) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || 
@@ -2521,11 +3668,18 @@ nvmlReturn_t nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char* vgpuTypeL rpc_read(0, vgpuTypeLicenseString, size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeLicenseString, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&vgpuTypeLicenseString[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* frameRateLimit) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetFrameRateLimit) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2533,11 +3687,16 @@ nvmlReturn_t nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned rpc_read(0, frameRateLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int* vgpuInstanceCount) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetMaxInstances) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2546,11 +3705,16 @@ nvmlReturn_t nvmlVgpuTypeGetMaxInstances(nvmlDevice_t 
device, nvmlVgpuTypeId_t v rpc_read(0, vgpuInstanceCount, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int* vgpuInstanceCountPerVm) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCountPerVm, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetMaxInstancesPerVm) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2558,11 +3722,18 @@ nvmlReturn_t nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsig rpc_read(0, vgpuInstanceCountPerVm, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstanceCountPerVm, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int* vgpuCount, nvmlVgpuInstance_t* vgpuInstances) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstances, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuInstances[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetActiveVgpus) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2572,11 +3743,22 @@ nvmlReturn_t nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int* vgpuCou rpc_read(0, vgpuInstances, 
*vgpuCount * sizeof(nvmlVgpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstances, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuCount); i++) + maybe_copy_unified_arg(0, (void*)&vgpuInstances[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char* vmId, unsigned int size, nvmlVgpuVmIdType_t* vmIdType) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vmId, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&vmId[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vmIdType, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetVmID) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2586,11 +3768,22 @@ nvmlReturn_t nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char* vmId rpc_read(0, vmIdType, sizeof(nvmlVgpuVmIdType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vmId, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&vmId[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vmIdType, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char* uuid, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetUUID) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2599,11 +3792,21 @@ nvmlReturn_t nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char* uuid rpc_read(0, uuid, size * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetVmDriverVersion) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2612,11 +3815,18 @@ nvmlReturn_t nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, version, length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, 
cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(length); i++) + maybe_copy_unified_arg(0, (void*)&version[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long* fbUsage) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbUsage, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFbUsage) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2624,11 +3834,15 @@ nvmlReturn_t nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigne rpc_read(0, fbUsage, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbUsage, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int* licensed) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)licensed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetLicenseStatus) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2636,11 +3850,15 @@ nvmlReturn_t nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, u rpc_read(0, licensed, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)licensed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t* vgpuTypeId) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuTypeId, 
cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetType) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2648,11 +3866,15 @@ nvmlReturn_t nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTy rpc_read(0, vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuTypeId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int* frameRateLimit) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFrameRateLimit) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2660,11 +3882,15 @@ nvmlReturn_t nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, frameRateLimit, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)frameRateLimit, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t* eccMode) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)eccMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEccMode) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2672,11 +3898,15 @@ nvmlReturn_t nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEna rpc_read(0, eccMode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)eccMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int* encoderCapacity) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEncoderCapacity) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2684,11 +3914,15 @@ nvmlReturn_t nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, encoderCapacity, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)encoderCapacity, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&encoderCapacity, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceSetEncoderCapacity) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2696,11 +3930,17 @@ nvmlReturn_t nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&encoderCapacity, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int* sessionCount, unsigned int* averageFps, 
unsigned int* averageLatency) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEncoderStats) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2710,11 +3950,20 @@ nvmlReturn_t nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, un rpc_read(0, averageLatency, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageFps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)averageLatency, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int* sessionCount, nvmlEncoderSessionInfo_t* sessionInfo) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetEncoderSessions) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2724,11 +3973,18 @@ nvmlReturn_t nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, sessionInfo, *sessionCount * sizeof(nvmlEncoderSessionInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, 
(void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t* fbcStats) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFBCStats) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2736,11 +3992,18 @@ nvmlReturn_t nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFB rpc_read(0, fbcStats, sizeof(nvmlFBCStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fbcStats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int* sessionCount, nvmlFBCSessionInfo_t* sessionInfo) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetFBCSessions) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2750,11 +4013,18 @@ nvmlReturn_t nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, uns rpc_read(0, sessionInfo, *sessionCount * sizeof(nvmlFBCSessionInfo_t)) < 0 || 
rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sessionInfo, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*sessionCount); i++) + maybe_copy_unified_arg(0, (void*)&sessionInfo[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int* gpuInstanceId) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstanceId, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetGpuInstanceId) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2762,11 +4032,18 @@ nvmlReturn_t nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, u rpc_read(0, gpuInstanceId, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstanceId, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char* vgpuPciId, unsigned int* length) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuPciId, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*length); i++) + maybe_copy_unified_arg(0, (void*)&vgpuPciId[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetGpuPciId) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2776,11 +4053,19 @@ nvmlReturn_t nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char* rpc_read(0, 
vgpuPciId, *length * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuPciId, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*length); i++) + maybe_copy_unified_arg(0, (void*)&vgpuPciId[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int* capResult) { + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuTypeGetCapabilities) < 0 || rpc_write(0, &vgpuTypeId, sizeof(nvmlVgpuTypeId_t)) < 0 || @@ -2789,11 +4074,19 @@ nvmlReturn_t nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCa rpc_read(0, capResult, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuTypeId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&capability, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)capResult, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t* vgpuMetadata, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&vgpuMetadata[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, 
RPC_nvmlVgpuInstanceGetMetadata) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2803,11 +4096,21 @@ nvmlReturn_t nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVg rpc_read(0, vgpuMetadata, *bufferSize * sizeof(nvmlVgpuMetadata_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&vgpuMetadata[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t* pgpuMetadata, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuMetadata) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2817,11 +4120,19 @@ nvmlReturn_t nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata rpc_read(0, pgpuMetadata, *bufferSize * sizeof(nvmlVgpuPgpuMetadata_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t 
nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t* vgpuMetadata, nvmlVgpuPgpuMetadata_t* pgpuMetadata, nvmlVgpuPgpuCompatibility_t* compatibilityInfo) { + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)compatibilityInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetVgpuCompatibility) < 0 || rpc_write(0, vgpuMetadata, sizeof(nvmlVgpuMetadata_t)) < 0 || @@ -2831,11 +4142,19 @@ nvmlReturn_t nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t* vgpuMetadata, nvmlVgpu rpc_read(0, compatibilityInfo, sizeof(nvmlVgpuPgpuCompatibility_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)vgpuMetadata, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)compatibilityInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char* pgpuMetadata, unsigned int* bufferSize) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetPgpuMetadataString) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2845,11 +4164,18 @@ nvmlReturn_t nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char* pgpuMeta rpc_read(0, pgpuMetadata, *bufferSize * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferSize, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pgpuMetadata, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*bufferSize); i++) + maybe_copy_unified_arg(0, (void*)&pgpuMetadata[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t* pSchedulerLog) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pSchedulerLog, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuSchedulerLog) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2857,11 +4183,15 @@ nvmlReturn_t nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedule rpc_read(0, pSchedulerLog, sizeof(nvmlVgpuSchedulerLog_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pSchedulerLog, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t* pSchedulerState) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pSchedulerState, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuSchedulerState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2869,11 +4199,15 @@ nvmlReturn_t nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedu rpc_read(0, pSchedulerState, sizeof(nvmlVgpuSchedulerGetState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pSchedulerState, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t* 
pCapabilities) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pCapabilities, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuSchedulerCapabilities) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2881,11 +4215,15 @@ nvmlReturn_t nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgp rpc_read(0, pCapabilities, sizeof(nvmlVgpuSchedulerCapabilities_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pCapabilities, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetVgpuVersion(nvmlVgpuVersion_t* supported, nvmlVgpuVersion_t* current) { + maybe_copy_unified_arg(0, (void*)supported, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetVgpuVersion) < 0 || rpc_wait_for_response(0) < 0 || @@ -2893,22 +4231,33 @@ nvmlReturn_t nvmlGetVgpuVersion(nvmlVgpuVersion_t* supported, nvmlVgpuVersion_t* rpc_read(0, current, sizeof(nvmlVgpuVersion_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)supported, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)current, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlSetVgpuVersion(nvmlVgpuVersion_t* vgpuVersion) { + maybe_copy_unified_arg(0, (void*)vgpuVersion, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlSetVgpuVersion) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, vgpuVersion, sizeof(nvmlVgpuVersion_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)vgpuVersion, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t 
nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, nvmlValueType_t* sampleValType, unsigned int* vgpuInstanceSamplesCount, nvmlVgpuInstanceUtilizationSample_t* utilizationSamples) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuInstanceSamplesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuInstanceSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2921,11 +4270,24 @@ nvmlReturn_t nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long lon rpc_read(0, utilizationSamples, *vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sampleValType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuInstanceSamplesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuInstanceSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, unsigned int* vgpuProcessSamplesCount, nvmlVgpuProcessUtilizationSample_t* utilizationSamples) { + maybe_copy_unified_arg(0, 
(void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)vgpuProcessSamplesCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*vgpuProcessSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetVgpuProcessUtilization) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -2936,11 +4298,19 @@ nvmlReturn_t nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned l rpc_read(0, utilizationSamples, *vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lastSeenTimeStamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)vgpuProcessSamplesCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)utilizationSamples, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*vgpuProcessSamplesCount); i++) + maybe_copy_unified_arg(0, (void*)&utilizationSamples[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t* mode) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetAccountingMode) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2948,11 +4318,18 @@ nvmlReturn_t nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, mode, sizeof(nvmlEnableState_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + 
maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int* count, unsigned int* pids) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetAccountingPids) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2962,11 +4339,19 @@ nvmlReturn_t nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, pids, *count * sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pids, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&pids[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t* stats) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetAccountingStats) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2975,22 +4360,29 @@ nvmlReturn_t nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, stats, 
sizeof(nvmlAccountingStats_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)stats, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceClearAccountingPids) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t* licenseInfo) { + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)licenseInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlVgpuInstanceGetLicenseInfo_v2) < 0 || rpc_write(0, &vgpuInstance, sizeof(nvmlVgpuInstance_t)) < 0 || @@ -2998,22 +4390,28 @@ nvmlReturn_t nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, rpc_read(0, licenseInfo, sizeof(nvmlVgpuLicenseInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&vgpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)licenseInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetExcludedDeviceCount(unsigned int* deviceCount) { + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetExcludedDeviceCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, deviceCount, 
sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)deviceCount, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGetExcludedDeviceInfoByIndex) < 0 || rpc_write(0, &index, sizeof(unsigned int)) < 0 || @@ -3021,11 +4419,16 @@ nvmlReturn_t nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDe rpc_read(0, info, sizeof(nvmlExcludedDeviceInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t* activationStatus) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)activationStatus, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetMigMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3034,11 +4437,17 @@ nvmlReturn_t nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlRe rpc_read(0, activationStatus, sizeof(nvmlReturn_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)activationStatus, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int* currentMode, unsigned int* 
pendingMode) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)currentMode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pendingMode, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMigMode) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3047,11 +4456,17 @@ nvmlReturn_t nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int* currentMode rpc_read(0, pendingMode, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)currentMode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pendingMode, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, nvmlGpuInstanceProfileInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceProfileInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3060,11 +4475,17 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned i rpc_read(0, info, sizeof(nvmlGpuInstanceProfileInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, nvmlGpuInstanceProfileInfo_v2_t* info) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceProfileInfoV) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3073,11 +4494,20 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned rpc_read(0, info, sizeof(nvmlGpuInstanceProfileInfo_v2_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t* placements, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstancePossiblePlacements_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3088,11 +4518,20 @@ nvmlReturn_t nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, rpc_read(0, placements, *count * sizeof(nvmlGpuInstancePlacement_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)placements, 
cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceRemainingCapacity) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3101,11 +4540,17 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsi rpc_read(0, count, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstance_t* gpuInstance) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceCreateGpuInstance) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3114,22 +4559,33 @@ nvmlReturn_t nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profi rpc_read(0, gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)gpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceDestroy) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstance_t* gpuInstances, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstances, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&gpuInstances[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstances) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3140,11 +4596,20 @@ nvmlReturn_t nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profile rpc_read(0, gpuInstances, *count * sizeof(nvmlGpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstances, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&gpuInstances[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int 
id, nvmlGpuInstance_t* gpuInstance) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceById) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3153,11 +4618,16 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, rpc_read(0, gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetInfo) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3165,11 +4635,17 @@ nvmlReturn_t nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstan rpc_read(0, info, sizeof(nvmlGpuInstanceInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, unsigned int engProfile, nvmlComputeInstanceProfileInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&engProfile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceProfileInfo) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3179,11 +4655,19 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuI rpc_read(0, info, sizeof(nvmlComputeInstanceProfileInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, unsigned int engProfile, nvmlComputeInstanceProfileInfo_v2_t* info) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceProfileInfoV) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3193,11 +4677,18 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpu rpc_read(0, info, sizeof(nvmlComputeInstanceProfileInfo_v2_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&engProfile, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); 
return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, unsigned int profileId, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceRemainingCapacity) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3206,11 +4697,20 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_ rpc_read(0, count, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstancePlacement_t* placements, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstancePossiblePlacements) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3221,11 +4721,20 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance rpc_read(0, placements, *count * sizeof(nvmlComputeInstancePlacement_t)) < 0 || rpc_end_response(0, 
&return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)placements, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&placements[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstance_t* computeInstance) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceCreateComputeInstance) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3234,22 +4743,33 @@ nvmlReturn_t nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, rpc_read(0, computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance) { + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlComputeInstanceDestroy) < 0 || rpc_write(0, &computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&computeInstance, 
cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstance_t* computeInstances, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)computeInstances, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&computeInstances[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstances) < 0 || rpc_write(0, &gpuInstance, sizeof(nvmlGpuInstance_t)) < 0 || @@ -3260,11 +4780,20 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, u rpc_read(0, computeInstances, *count * sizeof(nvmlComputeInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&profileId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)computeInstances, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(*count); i++) + maybe_copy_unified_arg(0, (void*)&computeInstances[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id, nvmlComputeInstance_t* computeInstance) { + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpuInstanceGetComputeInstanceById) < 0 || rpc_write(0, &gpuInstance, 
sizeof(nvmlGpuInstance_t)) < 0 || @@ -3273,11 +4802,16 @@ nvmlReturn_t nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance rpc_read(0, computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpuInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&id, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)computeInstance, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t* info) { + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlComputeInstanceGetInfo_v2) < 0 || rpc_write(0, &computeInstance, sizeof(nvmlComputeInstance_t)) < 0 || @@ -3285,11 +4819,15 @@ nvmlReturn_t nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance rpc_read(0, info, sizeof(nvmlComputeInstanceInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&computeInstance, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int* isMigDevice) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isMigDevice, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceIsMigDeviceHandle) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3297,11 +4835,15 @@ nvmlReturn_t nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int* isMi rpc_read(0, isMigDevice, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isMigDevice, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int* id) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuInstanceId) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3309,11 +4851,15 @@ nvmlReturn_t nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int* id) rpc_read(0, id, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int* id) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetComputeInstanceId) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3321,11 +4867,15 @@ nvmlReturn_t nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int* i rpc_read(0, id, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)id, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int* count) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMaxMigDeviceCount) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3333,11 +4883,16 @@ nvmlReturn_t 
nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int* c rpc_read(0, count, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, nvmlDevice_t* migDevice) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)migDevice, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMigDeviceHandleByIndex) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3346,11 +4901,16 @@ nvmlReturn_t nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned i rpc_read(0, migDevice, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&index, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)migDevice, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t* device) { + maybe_copy_unified_arg(0, (void*)&migDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDeviceHandleFromMigDeviceHandle) < 0 || rpc_write(0, &migDevice, sizeof(nvmlDevice_t)) < 0 || @@ -3358,11 +4918,15 @@ nvmlReturn_t nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice rpc_read(0, device, sizeof(nvmlDevice_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&migDevice, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t* type) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetBusType) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3370,11 +4934,15 @@ nvmlReturn_t nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t* type) rpc_read(0, type, sizeof(nvmlBusType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t* pDynamicPstatesInfo) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDynamicPstatesInfo, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetDynamicPstatesInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3382,11 +4950,16 @@ nvmlReturn_t nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamic rpc_read(0, pDynamicPstatesInfo, sizeof(nvmlGpuDynamicPstatesInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDynamicPstatesInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&speed, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if 
(rpc_start_request(0, RPC_nvmlDeviceSetFanSpeed_v2) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3395,11 +4968,16 @@ nvmlReturn_t nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, uns rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&speed, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int* offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpcClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3407,11 +4985,15 @@ nvmlReturn_t nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int* offset) rpc_read(0, offset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetGpcClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3419,11 +5001,15 @@ nvmlReturn_t nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); return return_value; } 
nvmlReturn_t nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int* offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3431,11 +5017,15 @@ nvmlReturn_t nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int* offset) rpc_read(0, offset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetMemClkVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3443,11 +5033,18 @@ nvmlReturn_t nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, unsigned int* minClockMHz, unsigned int* maxClockMHz) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pstate, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minClockMHz, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxClockMHz, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if 
(rpc_start_request(0, RPC_nvmlDeviceGetMinMaxClockOfPState) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3458,11 +5055,21 @@ nvmlReturn_t nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType rpc_read(0, maxClockMHz, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pstate, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minClockMHz, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxClockMHz, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, nvmlPstates_t* pstates, unsigned int size) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pstates, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&pstates[i], cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetSupportedPerformanceStates) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3471,11 +5078,19 @@ nvmlReturn_t nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, nvmlPs rpc_read(0, pstates, size * sizeof(nvmlPstates_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pstates, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(size); i++) + maybe_copy_unified_arg(0, (void*)&pstates[i], cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, int* minOffset, int* 
maxOffset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpcClkMinMaxVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3484,11 +5099,17 @@ nvmlReturn_t nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, int* minOffs rpc_read(0, maxOffset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, int* minOffset, int* maxOffset) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetMemClkMinMaxVfOffset) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3497,11 +5118,16 @@ nvmlReturn_t nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, int* minOffs rpc_read(0, maxOffset, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxOffset, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpuFabricInfo, cudaMemcpyHostToDevice); nvmlReturn_t 
return_value; if (rpc_start_request(0, RPC_nvmlDeviceGetGpuFabricInfo) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3509,44 +5135,54 @@ nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t rpc_read(0, gpuFabricInfo, sizeof(nvmlGpuFabricInfo_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpuFabricInfo, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmMetricsGet(nvmlGpmMetricsGet_t* metricsGet) { + maybe_copy_unified_arg(0, (void*)metricsGet, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmMetricsGet) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, metricsGet, sizeof(nvmlGpmMetricsGet_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)metricsGet, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmSampleFree(nvmlGpmSample_t gpmSample) { + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmSampleFree) < 0 || rpc_write(0, &gpmSample, sizeof(nvmlGpmSample_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmSampleAlloc(nvmlGpmSample_t* gpmSample) { + maybe_copy_unified_arg(0, (void*)gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmSampleAlloc) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, gpmSample, sizeof(nvmlGpmSample_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmSampleGet(nvmlDevice_t 
device, nvmlGpmSample_t gpmSample) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmSampleGet) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3554,11 +5190,16 @@ nvmlReturn_t nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpuInstanceId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmMigSampleGet) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3567,11 +5208,16 @@ nvmlReturn_t nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpuInstanceId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gpmSample, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t* gpmSupport) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)gpmSupport, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlGpmQueryDeviceSupport) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3579,11 +5225,15 @@ 
nvmlReturn_t nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t* gp rpc_read(0, gpmSupport, sizeof(nvmlGpmSupport_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)gpmSupport, cudaMemcpyDeviceToHost); return return_value; } nvmlReturn_t nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t* info) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); nvmlReturn_t return_value; if (rpc_start_request(0, RPC_nvmlDeviceSetNvLinkDeviceLowPowerThreshold) < 0 || rpc_write(0, &device, sizeof(nvmlDevice_t)) < 0 || @@ -3591,33 +5241,41 @@ nvmlReturn_t nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvm rpc_read(0, info, sizeof(nvmlNvLinkPowerThres_t)) < 0 || rpc_end_response(0, &return_value) < 0) return NVML_ERROR_GPU_IS_LOST; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } CUresult cuInit(unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuInit) < 0 || rpc_write(0, &Flags, sizeof(unsigned int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDriverGetVersion(int* driverVersion) { + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDriverGetVersion) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, driverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)driverVersion, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGet(CUdevice* device, int ordinal) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ordinal, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGet) < 0 || rpc_write(0, &ordinal, sizeof(int)) < 0 || @@ -3625,22 +5283,31 @@ CUresult cuDeviceGet(CUdevice* device, int ordinal) rpc_read(0, device, sizeof(CUdevice)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ordinal, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetCount(int* count) { + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, count, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetName(char* name, int len, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(len); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetName) < 0 || rpc_write(0, &len, sizeof(int)) < 0 || @@ -3649,11 +5316,20 @@ CUresult cuDeviceGetName(char* name, int len, CUdevice dev) rpc_read(0, name, len * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); + for (int 
i = 0; i < static_cast(len); i++) + maybe_copy_unified_arg(0, (void*)&name[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetUuid) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3661,11 +5337,19 @@ CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) rpc_read(0, uuid, 16) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyHostToDevice); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetUuid_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3673,11 +5357,18 @@ CUresult cuDeviceGetUuid_v2(CUuuid* uuid, CUdevice dev) rpc_read(0, uuid, 16) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)uuid, cudaMemcpyDeviceToHost); + for (int i = 0; i < 16; i++) + maybe_copy_unified_arg(0, (void*)&uuid[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetLuid(char* luid, unsigned int* deviceNodeMask, CUdevice dev) { + 
maybe_copy_unified_arg(0, (void*)luid, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)deviceNodeMask, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; std::size_t luid_len; if (rpc_start_request(0, RPC_cuDeviceGetLuid) < 0 || @@ -3688,11 +5379,16 @@ CUresult cuDeviceGetLuid(char* luid, unsigned int* deviceNodeMask, CUdevice dev) rpc_read(0, deviceNodeMask, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)luid, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)deviceNodeMask, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceTotalMem_v2(size_t* bytes, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceTotalMem_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3700,11 +5396,17 @@ CUresult cuDeviceTotalMem_v2(size_t* bytes, CUdevice dev) rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numChannels, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetTexture1DLinearMaxWidth) < 0 || rpc_write(0, &format, 
sizeof(CUarray_format)) < 0 || @@ -3714,11 +5416,18 @@ CUresult cuDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, CUarray_ rpc_read(0, maxWidthInElements, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numChannels, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetAttribute) < 0 || rpc_write(0, &attrib, sizeof(CUdevice_attribute)) < 0 || @@ -3727,11 +5436,16 @@ CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceSetMemPool) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3739,11 +5453,15 @@ CUresult cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetMemPool(CUmemoryPool* pool, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetMemPool) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3751,11 +5469,15 @@ CUresult cuDeviceGetMemPool(CUmemoryPool* pool, CUdevice dev) rpc_read(0, pool, sizeof(CUmemoryPool)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetDefaultMemPool(CUmemoryPool* pool_out, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pool_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetDefaultMemPool) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3763,11 +5485,16 @@ CUresult cuDeviceGetDefaultMemPool(CUmemoryPool* pool_out, CUdevice dev) rpc_read(0, pool_out, sizeof(CUmemoryPool)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pool_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetExecAffinitySupport(int* pi, CUexecAffinityType type, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetExecAffinitySupport) < 0 || rpc_write(0, &type, sizeof(CUexecAffinityType)) < 0 || @@ -3776,11 +5503,16 @@ CUresult 
cuDeviceGetExecAffinitySupport(int* pi, CUexecAffinityType type, CUdevi rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) { + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFlushGPUDirectRDMAWrites) < 0 || rpc_write(0, &target, sizeof(CUflushGPUDirectRDMAWritesTarget)) < 0 || @@ -3788,11 +5520,15 @@ CUresult cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUf rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetProperties) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3800,11 +5536,16 @@ CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) rpc_read(0, prop, sizeof(CUdevprop)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)major, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceComputeCapability) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3813,11 +5554,16 @@ CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) rpc_read(0, minor, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)major, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)minor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxRetain(CUcontext* pctx, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxRetain) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3825,22 +5571,28 @@ CUresult cuDevicePrimaryCtxRetain(CUcontext* pctx, CUdevice dev) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxRelease_v2(CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxRelease_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice dev, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&dev, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxSetFlags_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3848,11 +5600,16 @@ CUresult cuDevicePrimaryCtxSetFlags_v2(CUdevice dev, unsigned int flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* active) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)active, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxGetState) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || @@ -3861,22 +5618,30 @@ CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* acti rpc_read(0, active, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)active, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDevicePrimaryCtxReset_v2(CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDevicePrimaryCtxReset_v2) < 0 || rpc_write(0, &dev, sizeof(CUdevice)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) { + 
maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxCreate_v2) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || @@ -3885,11 +5650,21 @@ CUresult cuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int numParams, unsigned int flags, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(numParams); i++) + maybe_copy_unified_arg(0, (void*)¶msArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxCreate_v3) < 0 || rpc_write(0, &numParams, sizeof(int)) < 0 || @@ -3900,88 +5675,111 @@ CUresult cuCtxCreate_v3(CUcontext* pctx, CUexecAffinityParam* paramsArray, int n rpc_read(0, paramsArray, numParams * sizeof(CUexecAffinityParam)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numParams); 
i++) + maybe_copy_unified_arg(0, (void*)¶msArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxDestroy_v2(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxDestroy_v2) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxPushCurrent_v2(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxPushCurrent_v2) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxPopCurrent_v2(CUcontext* pctx) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxPopCurrent_v2) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxSetCurrent(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetCurrent) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetCurrent(CUcontext* 
pctx) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetCurrent) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetDevice(CUdevice* device) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetDevice) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, device, sizeof(CUdevice)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetFlags(unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetFlags) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetId(CUcontext ctx, unsigned long long* ctxId) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)ctxId, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetId) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || @@ -3989,6 +5787,8 @@ CUresult cuCtxGetId(CUcontext ctx, unsigned long long* ctxId) rpc_read(0, ctxId, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)ctxId, cudaMemcpyDeviceToHost); return return_value; } @@ -4004,6 +5804,8 @@ CUresult cuCtxSynchronize() 
CUresult cuCtxSetLimit(CUlimit limit, size_t value) { + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetLimit) < 0 || rpc_write(0, &limit, sizeof(CUlimit)) < 0 || @@ -4011,11 +5813,15 @@ CUresult cuCtxSetLimit(CUlimit limit, size_t value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetLimit(size_t* pvalue, CUlimit limit) { + maybe_copy_unified_arg(0, (void*)pvalue, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetLimit) < 0 || rpc_write(0, &limit, sizeof(CUlimit)) < 0 || @@ -4023,55 +5829,67 @@ CUresult cuCtxGetLimit(size_t* pvalue, CUlimit limit) rpc_read(0, pvalue, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pvalue, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetCacheConfig(CUfunc_cache* pconfig) { + maybe_copy_unified_arg(0, (void*)pconfig, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetCacheConfig) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pconfig, sizeof(CUfunc_cache)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pconfig, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxSetCacheConfig(CUfunc_cache config) { + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetCacheConfig) < 0 || rpc_write(0, 
&config, sizeof(CUfunc_cache)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetSharedMemConfig(CUsharedconfig* pConfig) { + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetSharedMemConfig) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, pConfig, sizeof(CUsharedconfig)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxSetSharedMemConfig(CUsharedconfig config) { + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxSetSharedMemConfig) < 0 || rpc_write(0, &config, sizeof(CUsharedconfig)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int* version) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetApiVersion) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || @@ -4079,11 +5897,15 @@ CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int* version) rpc_read(0, version, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxGetStreamPriorityRange(int* leastPriority, int* greatestPriority) { + maybe_copy_unified_arg(0, 
(void*)leastPriority, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetStreamPriorityRange) < 0 || rpc_wait_for_response(0) < 0 || @@ -4091,6 +5913,8 @@ CUresult cuCtxGetStreamPriorityRange(int* leastPriority, int* greatestPriority) rpc_read(0, greatestPriority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyDeviceToHost); return return_value; } @@ -4106,6 +5930,8 @@ CUresult cuCtxResetPersistingL2Cache() CUresult cuCtxGetExecAffinity(CUexecAffinityParam* pExecAffinity, CUexecAffinityType type) { + maybe_copy_unified_arg(0, (void*)pExecAffinity, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxGetExecAffinity) < 0 || rpc_write(0, &type, sizeof(CUexecAffinityType)) < 0 || @@ -4113,11 +5939,15 @@ CUresult cuCtxGetExecAffinity(CUexecAffinityParam* pExecAffinity, CUexecAffinity rpc_read(0, pExecAffinity, sizeof(CUexecAffinityParam)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pExecAffinity, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxAttach(CUcontext* pctx, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxAttach) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || @@ -4125,22 +5955,28 @@ CUresult cuCtxAttach(CUcontext* pctx, unsigned int flags) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxDetach(CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxDetach) < 0 || rpc_write(0, &ctx, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleLoad(CUmodule* module, const char* fname) { + maybe_copy_unified_arg(0, (void*)module, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fname, cudaMemcpyHostToDevice); CUresult return_value; std::size_t fname_len = std::strlen(fname) + 1; if (rpc_start_request(0, RPC_cuModuleLoad) < 0 || @@ -4150,22 +5986,27 @@ CUresult cuModuleLoad(CUmodule* module, const char* fname) rpc_read(0, module, sizeof(CUmodule)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)module, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fname, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleUnload(CUmodule hmod) { + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuModuleUnload) < 0 || rpc_write(0, &hmod, sizeof(CUmodule)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetLoadingMode(CUmoduleLoadingMode* mode) { + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuModuleGetLoadingMode) < 0 || rpc_write(0, mode, sizeof(CUmoduleLoadingMode)) < 0 || @@ -4173,11 
+6014,15 @@ CUresult cuModuleGetLoadingMode(CUmoduleLoadingMode* mode) rpc_read(0, mode, sizeof(CUmoduleLoadingMode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetFunction) < 0 || @@ -4188,11 +6033,18 @@ CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) rpc_read(0, hfunc, sizeof(CUfunction)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetGlobal_v2(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetGlobal_v2) < 0 || @@ -4204,11 +6056,19 @@ CUresult cuModuleGetGlobal_v2(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, c rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bytes, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) { + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)stateOut, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLinkCreate_v2) < 0 || rpc_write(0, &numOptions, sizeof(unsigned int)) < 0 || @@ -4221,11 +6081,25 @@ CUresult cuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void** rpc_read(0, stateOut, sizeof(CUlinkState)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)stateOut, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, CUjit_option* options, void** optionValues) { + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&options[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyHostToDevice); + for (int i = 0; i < 
static_cast(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&optionValues[i], cudaMemcpyHostToDevice); CUresult return_value; std::size_t path_len = std::strlen(path) + 1; if (rpc_start_request(0, RPC_cuLinkAddFile_v2) < 0 || @@ -4239,11 +6113,24 @@ CUresult cuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* pa rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)options, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&options[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)optionValues, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numOptions); i++) + maybe_copy_unified_arg(0, (void*)&optionValues[i], cudaMemcpyDeviceToHost); return return_value; } CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) { + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cubinOut, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeOut, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLinkComplete) < 0 || rpc_write(0, &state, sizeof(CUlinkState)) < 0 || @@ -4252,22 +6139,30 @@ CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) rpc_read(0, sizeOut, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cubinOut, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeOut, cudaMemcpyDeviceToHost); return 
return_value; } CUresult cuLinkDestroy(CUlinkState state) { + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLinkDestroy) < 0 || rpc_write(0, &state, sizeof(CUlinkState)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&state, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetTexRef(CUtexref* pTexRef, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetTexRef) < 0 || @@ -4278,11 +6173,17 @@ CUresult cuModuleGetTexRef(CUtexref* pTexRef, CUmodule hmod, const char* name) rpc_read(0, pTexRef, sizeof(CUtexref)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuModuleGetSurfRef(CUsurfref* pSurfRef, CUmodule hmod, const char* name) { + maybe_copy_unified_arg(0, (void*)pSurfRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuModuleGetSurfRef) < 0 || @@ -4293,11 +6194,30 @@ CUresult cuModuleGetSurfRef(CUsurfref* pSurfRef, CUmodule hmod, const char* name rpc_read(0, pSurfRef, sizeof(CUsurfref)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pSurfRef, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryLoadFromFile(CUlibrary* library, const char* fileName, CUjit_option* jitOptions, void** jitOptionsValues, unsigned int numJitOptions, CUlibraryOption* libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions) { + maybe_copy_unified_arg(0, (void*)library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fileName, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numJitOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)jitOptions, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptions[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)jitOptionsValues, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptionsValues[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numLibraryOptions, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)libraryOptions, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptions[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)libraryOptionValues, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptionValues[i], cudaMemcpyHostToDevice); CUresult return_value; std::size_t fileName_len = std::strlen(fileName) + 1; if (rpc_start_request(0, RPC_cuLibraryLoadFromFile) < 0 || @@ -4313,22 +6233,43 @@ CUresult cuLibraryLoadFromFile(CUlibrary* library, const char* fileName, CUjit_o rpc_read(0, library, sizeof(CUlibrary)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)library, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fileName, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numJitOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)jitOptions, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptions[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)jitOptionsValues, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numJitOptions); i++) + maybe_copy_unified_arg(0, (void*)&jitOptionsValues[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numLibraryOptions, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)libraryOptions, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptions[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)libraryOptionValues, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(numLibraryOptions); i++) + maybe_copy_unified_arg(0, (void*)&libraryOptionValues[i], cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryUnload(CUlibrary library) { + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLibraryUnload) < 0 || rpc_write(0, &library, sizeof(CUlibrary)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetKernel(CUkernel* pKernel, CUlibrary library, const char* name) { + maybe_copy_unified_arg(0, (void*)pKernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuLibraryGetKernel) < 0 || @@ 
-4339,11 +6280,16 @@ CUresult cuLibraryGetKernel(CUkernel* pKernel, CUlibrary library, const char* na rpc_read(0, pKernel, sizeof(CUkernel)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pKernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetModule(CUmodule* pMod, CUlibrary library) { + maybe_copy_unified_arg(0, (void*)pMod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLibraryGetModule) < 0 || rpc_write(0, &library, sizeof(CUlibrary)) < 0 || @@ -4351,11 +6297,15 @@ CUresult cuLibraryGetModule(CUmodule* pMod, CUlibrary library) rpc_read(0, pMod, sizeof(CUmodule)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pMod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelGetFunction(CUfunction* pFunc, CUkernel kernel) { + maybe_copy_unified_arg(0, (void*)pFunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelGetFunction) < 0 || rpc_write(0, &kernel, sizeof(CUkernel)) < 0 || @@ -4363,11 +6313,17 @@ CUresult cuKernelGetFunction(CUfunction* pFunc, CUkernel kernel) rpc_read(0, pFunc, sizeof(CUfunction)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUlibrary library, const char* name) { + maybe_copy_unified_arg(0, (void*)dptr, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuLibraryGetGlobal) < 0 || @@ -4379,11 +6335,19 @@ CUresult cuLibraryGetGlobal(CUdeviceptr* dptr, size_t* bytes, CUlibrary library, rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetManaged(CUdeviceptr* dptr, size_t* bytes, CUlibrary library, const char* name) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyHostToDevice); CUresult return_value; std::size_t name_len = std::strlen(name) + 1; if (rpc_start_request(0, RPC_cuLibraryGetManaged) < 0 || @@ -4395,11 +6359,18 @@ CUresult cuLibraryGetManaged(CUdeviceptr* dptr, size_t* bytes, CUlibrary library rpc_read(0, bytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)name, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLibraryGetUnifiedFunction(void** fptr, CUlibrary library, const char* symbol) { + 
maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); CUresult return_value; std::size_t symbol_len = std::strlen(symbol) + 1; if (rpc_start_request(0, RPC_cuLibraryGetUnifiedFunction) < 0 || @@ -4410,11 +6381,18 @@ CUresult cuLibraryGetUnifiedFunction(void** fptr, CUlibrary library, const char* rpc_read(0, fptr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&library, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelGetAttribute(int* pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelGetAttribute) < 0 || rpc_write(0, pi, sizeof(int)) < 0 || @@ -4425,11 +6403,19 @@ CUresult cuKernelGetAttribute(int* pi, CUfunction_attribute attrib, CUkernel ker rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&val, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelSetAttribute) < 0 || rpc_write(0, &attrib, sizeof(CUfunction_attribute)) < 0 || @@ -4439,11 +6425,18 @@ CUresult cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel ker rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&val, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuKernelSetCacheConfig) < 0 || rpc_write(0, &kernel, sizeof(CUkernel)) < 0 || @@ -4452,11 +6445,16 @@ CUresult cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice d rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&kernel, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetInfo_v2(size_t* free, size_t* total) { + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetInfo_v2) < 0 || rpc_write(0, free, sizeof(size_t)) < 0 || @@ -4466,11 +6464,15 
@@ CUresult cuMemGetInfo_v2(size_t* free, size_t* total) rpc_read(0, total, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAlloc_v2) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -4479,11 +6481,18 @@ CUresult cuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize) rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&WidthInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ElementSizeBytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocPitch_v2) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -4496,22 +6505,32 @@ CUresult cuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInByt rpc_read(0, pPitch, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pPitch, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&WidthInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ElementSizeBytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemFree_v2(CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemFree_v2) < 0 || rpc_write(0, &dptr, sizeof(CUdeviceptr)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAddressRange_v2(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)pbase, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)psize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAddressRange_v2) < 0 || rpc_write(0, pbase, sizeof(CUdeviceptr)) < 0 || @@ -4522,11 +6541,16 @@ CUresult cuMemGetAddressRange_v2(CUdeviceptr* pbase, size_t* psize, CUdeviceptr rpc_read(0, psize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pbase, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)psize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocHost_v2(void** pp, size_t bytesize) { + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocHost_v2) < 0 || rpc_write(0, &bytesize, sizeof(size_t)) < 0 || @@ -4534,22 +6558,29 @@ CUresult cuMemAllocHost_v2(void** pp, size_t bytesize) rpc_read(0, pp, sizeof(void*)) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemFreeHost(void* p) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemFreeHost) < 0 || rpc_write(0, &p, sizeof(void*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemHostAlloc) < 0 || rpc_write(0, &bytesize, sizeof(size_t)) < 0 || @@ -4558,11 +6589,17 @@ CUresult cuMemHostAlloc(void** pp, size_t bytesize, unsigned int Flags) rpc_read(0, pp, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemHostGetDevicePointer_v2) < 0 || rpc_write(0, pdptr, sizeof(CUdeviceptr)) < 0 || @@ -4572,11 +6609,16 @@ CUresult cuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned int rpc_read(0, 
pdptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemHostGetFlags(unsigned int* pFlags, void* p) { + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemHostGetFlags) < 0 || rpc_write(0, pFlags, sizeof(unsigned int)) < 0 || @@ -4585,11 +6627,16 @@ CUresult cuMemHostGetFlags(unsigned int* pFlags, void* p) rpc_read(0, pFlags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocManaged) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -4599,11 +6646,16 @@ CUresult cuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned int flag rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetByPCIBusId(CUdevice* dev, const char* pciBusId) { + maybe_copy_unified_arg(0, (void*)dev, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); CUresult return_value; std::size_t pciBusId_len = std::strlen(pciBusId) + 1; if (rpc_start_request(0, RPC_cuDeviceGetByPCIBusId) < 0 || @@ -4614,11 +6666,18 @@ CUresult cuDeviceGetByPCIBusId(CUdevice* dev, const char* pciBusId) rpc_read(0, dev, sizeof(CUdevice)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetPCIBusId(char* pciBusId, int len, CUdevice dev) { + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(len); i++) + maybe_copy_unified_arg(0, (void*)&pciBusId[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetPCIBusId) < 0 || rpc_write(0, &len, sizeof(int)) < 0 || @@ -4627,11 +6686,18 @@ CUresult cuDeviceGetPCIBusId(char* pciBusId, int len, CUdevice dev) rpc_read(0, pciBusId, len * sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(len); i++) + maybe_copy_unified_arg(0, (void*)&pciBusId[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcGetEventHandle(CUipcEventHandle* pHandle, CUevent event) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcGetEventHandle) < 0 || rpc_write(0, pHandle, 
sizeof(CUipcEventHandle)) < 0 || @@ -4640,11 +6706,15 @@ CUresult cuIpcGetEventHandle(CUipcEventHandle* pHandle, CUevent event) rpc_read(0, pHandle, sizeof(CUipcEventHandle)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcOpenEventHandle(CUevent* phEvent, CUipcEventHandle handle) { + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcOpenEventHandle) < 0 || rpc_write(0, phEvent, sizeof(CUevent)) < 0 || @@ -4653,11 +6723,15 @@ CUresult cuIpcOpenEventHandle(CUevent* phEvent, CUipcEventHandle handle) rpc_read(0, phEvent, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcGetMemHandle) < 0 || rpc_write(0, pHandle, sizeof(CUipcMemHandle)) < 0 || @@ -4666,11 +6740,16 @@ CUresult cuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) rpc_read(0, pHandle, sizeof(CUipcMemHandle)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcOpenMemHandle_v2(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags) { + maybe_copy_unified_arg(0, 
(void*)pdptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcOpenMemHandle_v2) < 0 || rpc_write(0, pdptr, sizeof(CUdeviceptr)) < 0 || @@ -4680,22 +6759,30 @@ CUresult cuIpcOpenMemHandle_v2(CUdeviceptr* pdptr, CUipcMemHandle handle, unsign rpc_read(0, pdptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuIpcCloseMemHandle(CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuIpcCloseMemHandle) < 0 || rpc_write(0, &dptr, sizeof(CUdeviceptr)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpy) < 0 || rpc_write(0, &dst, sizeof(CUdeviceptr)) < 0 || @@ -4704,11 +6791,19 @@ CUresult cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyPeer) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4719,11 +6814,19 @@ CUresult cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr s rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyHtoD_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4732,11 +6835,17 @@ CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void* srcHost, size_t Byte rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyDtoD_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4745,11 +6854,18 @@ CUresult cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t By rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyDtoA_v2) < 0 || rpc_write(0, &dstArray, sizeof(CUarray)) < 0 || @@ -4759,11 +6875,19 @@ CUresult cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAtoD_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4773,11 +6897,19 @@ CUresult cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffs rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAtoH_v2(void* dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)dstHost, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAtoH_v2) < 0 || rpc_write(0, &dstHost, sizeof(void*)) < 0 || @@ -4787,11 +6919,20 @@ CUresult cuMemcpyAtoH_v2(void* dstHost, CUarray srcArray, size_t srcOffset, size rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dstHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAtoA_v2) < 0 || rpc_write(0, &dstArray, sizeof(CUarray)) < 0 || @@ -4802,11 +6943,20 @@ CUresult cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, s rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyAsync) < 0 || rpc_write(0, &dst, sizeof(CUdeviceptr)) < 0 || @@ -4816,11 +6966,21 @@ CUresult cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstr rpc_wait_for_response(0) < 0 || 
rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyPeerAsync) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4832,11 +6992,21 @@ CUresult cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdevice rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)srcHost, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyHtoDAsync_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4846,11 +7016,19 @@ CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void* srcHost, size_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)srcHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemcpyDtoDAsync_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4860,11 +7038,18 @@ CUresult cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ByteCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N) { + maybe_copy_unified_arg(0, 
(void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD8_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4873,11 +7058,17 @@ CUresult cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD16_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4886,11 +7077,17 @@ CUresult cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD32_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4899,11 +7096,19 
@@ CUresult cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D8_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4914,11 +7119,21 @@ CUresult cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char u rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D16_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4929,11 +7144,21 @@ CUresult cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D32_v2) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4944,11 +7169,20 @@ CUresult cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int u rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD8Async(CUdeviceptr dstDevice, 
unsigned char uc, size_t N, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD8Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4958,11 +7192,19 @@ CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUst rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD16Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4972,11 +7214,19 @@ CUresult cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CU rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult 
cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD32Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -4986,11 +7236,21 @@ CUresult cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUst rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&N, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D8Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -5002,11 +7262,23 @@ CUresult cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D16Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -5018,11 +7290,23 @@ CUresult cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned sho rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&us, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ui, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemsetD2D32Async) < 0 || rpc_write(0, &dstDevice, sizeof(CUdeviceptr)) < 0 || @@ -5034,11 +7318,19 @@ CUresult cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstPitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ui, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayCreate_v2(CUarray* pHandle, const CUDA_ARRAY_DESCRIPTOR* pAllocateArray) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayCreate_v2) < 0 || rpc_write(0, pHandle, sizeof(CUarray)) < 0 || @@ -5047,11 +7339,15 @@ CUresult cuArrayCreate_v2(CUarray* pHandle, const CUDA_ARRAY_DESCRIPTOR* pAlloca rpc_read(0, pHandle, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetDescriptor_v2(CUDA_ARRAY_DESCRIPTOR* pArrayDescriptor, CUarray hArray) { + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetDescriptor_v2) < 0 || rpc_write(0, pArrayDescriptor, sizeof(CUDA_ARRAY_DESCRIPTOR)) < 0 || @@ -5060,11 +7356,15 @@ CUresult cuArrayGetDescriptor_v2(CUDA_ARRAY_DESCRIPTOR* pArrayDescriptor, CUarra rpc_read(0, pArrayDescriptor, sizeof(CUDA_ARRAY_DESCRIPTOR)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* sparseProperties, CUarray array) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetSparseProperties) < 0 || rpc_write(0, sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || @@ -5073,11 +7373,15 @@ CUresult cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* sparseProperti rpc_read(0, sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* sparseProperties, CUmipmappedArray mipmap) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayGetSparseProperties) < 0 || rpc_write(0, sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || @@ -5086,11 +7390,16 @@ CUresult cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES* spars rpc_read(0, 
sparseProperties, sizeof(CUDA_ARRAY_SPARSE_PROPERTIES)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* memoryRequirements, CUarray array, CUdevice device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || @@ -5100,11 +7409,17 @@ CUresult cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* memoryRequ rpc_read(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* memoryRequirements, CUmipmappedArray mipmap, CUdevice device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || @@ -5114,11 +7429,17 @@ CUresult cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS* m rpc_read(0, memoryRequirements, 
sizeof(CUDA_ARRAY_MEMORY_REQUIREMENTS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayGetPlane(CUarray* pPlaneArray, CUarray hArray, unsigned int planeIdx) { + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayGetPlane) < 0 || rpc_write(0, pPlaneArray, sizeof(CUarray)) < 0 || @@ -5128,22 +7449,29 @@ CUresult cuArrayGetPlane(CUarray* pPlaneArray, CUarray hArray, unsigned int plan rpc_read(0, pPlaneArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArrayDestroy(CUarray hArray) { + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArrayDestroy) < 0 || rpc_write(0, &hArray, sizeof(CUarray)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArray3DCreate_v2(CUarray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pAllocateArray) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, 
RPC_cuArray3DCreate_v2) < 0 || rpc_write(0, pHandle, sizeof(CUarray)) < 0 || @@ -5152,11 +7480,15 @@ CUresult cuArray3DCreate_v2(CUarray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pAl rpc_read(0, pHandle, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pAllocateArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuArray3DGetDescriptor_v2(CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CUarray hArray) { + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuArray3DGetDescriptor_v2) < 0 || rpc_write(0, pArrayDescriptor, sizeof(CUDA_ARRAY3D_DESCRIPTOR)) < 0 || @@ -5165,11 +7497,16 @@ CUresult cuArray3DGetDescriptor_v2(CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CU rpc_read(0, pArrayDescriptor, sizeof(CUDA_ARRAY3D_DESCRIPTOR)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pArrayDescriptor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, unsigned int numMipmapLevels) { + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pMipmappedArrayDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numMipmapLevels, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayCreate) < 0 || rpc_write(0, pHandle, sizeof(CUmipmappedArray)) < 0 || @@ -5179,11 +7516,17 @@ CUresult cuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DE rpc_read(0, pHandle, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) 
< 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pHandle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pMipmappedArrayDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numMipmapLevels, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level) { + maybe_copy_unified_arg(0, (void*)pLevelArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayGetLevel) < 0 || rpc_write(0, pLevelArray, sizeof(CUarray)) < 0 || @@ -5193,22 +7536,32 @@ CUresult cuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmap rpc_read(0, pLevelArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pLevelArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) { + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMipmappedArrayDestroy) < 0 || rpc_write(0, &hMipmappedArray, sizeof(CUmipmappedArray)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&alignment, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAddressReserve) < 0 || rpc_write(0, ptr, sizeof(CUdeviceptr)) < 0 || @@ -5220,11 +7573,18 @@ CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment, CU rpc_read(0, ptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&alignment, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAddressFree) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5232,11 +7592,17 @@ CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size, const CUmemAllocationProp* prop, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuMemCreate) < 0 || rpc_write(0, handle, sizeof(CUmemGenericAllocationHandle)) < 0 || @@ -5247,22 +7613,33 @@ CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size, const CU rpc_read(0, handle, sizeof(CUmemGenericAllocationHandle)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemRelease(CUmemGenericAllocationHandle handle) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemRelease) < 0 || rpc_write(0, &handle, sizeof(CUmemGenericAllocationHandle)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemMap) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5273,11 +7650,19 @@ CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAlloc rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemMapArrayAsync(CUarrayMapInfo* mapInfoList, unsigned int count, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)mapInfoList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemMapArrayAsync) < 0 || rpc_write(0, mapInfoList, sizeof(CUarrayMapInfo)) < 0 || @@ -5287,11 +7672,16 @@ CUresult cuMemMapArrayAsync(CUarrayMapInfo* mapInfoList, unsigned int count, CUs rpc_read(0, mapInfoList, sizeof(CUarrayMapInfo)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mapInfoList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) { + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemUnmap) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5299,11 +7689,17 @@ CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc* desc, size_t count) { + 
maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemSetAccess) < 0 || rpc_write(0, &ptr, sizeof(CUdeviceptr)) < 0 || @@ -5313,11 +7709,18 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc* des rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAccess(unsigned long long* flags, const CUmemLocation* location, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAccess) < 0 || rpc_write(0, flags, sizeof(unsigned long long)) < 0 || @@ -5327,11 +7730,17 @@ CUresult cuMemGetAccess(unsigned long long* flags, const CUmemLocation* location rpc_read(0, flags, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAllocationGranularity(size_t* granularity, const CUmemAllocationProp* prop, CUmemAllocationGranularity_flags option) { + maybe_copy_unified_arg(0, (void*)granularity, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&option, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAllocationGranularity) < 0 || rpc_write(0, granularity, sizeof(size_t)) < 0 || @@ -5341,11 +7750,16 @@ CUresult cuMemGetAllocationGranularity(size_t* granularity, const CUmemAllocatio rpc_read(0, granularity, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)granularity, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&option, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp* prop, CUmemGenericAllocationHandle handle) { + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemGetAllocationPropertiesFromHandle) < 0 || rpc_write(0, prop, sizeof(CUmemAllocationProp)) < 0 || @@ -5354,11 +7768,15 @@ CUresult cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp* prop, CUmem rpc_read(0, prop, sizeof(CUmemAllocationProp)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemFreeAsync) < 0 || rpc_write(0, &dptr, sizeof(CUdeviceptr)) < 0 || @@ -5366,11 +7784,16 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocAsync(CUdeviceptr* dptr, size_t bytesize, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocAsync) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -5380,11 +7803,16 @@ CUresult cuMemAllocAsync(CUdeviceptr* dptr, size_t bytesize, CUstream hStream) rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) { + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolTrimTo) < 0 || rpc_write(0, &pool, sizeof(CUmemoryPool)) < 0 || @@ -5392,11 +7820,16 @@ CUresult cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc* map, size_t count) { + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)map, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolSetAccess) < 0 || rpc_write(0, &pool, sizeof(CUmemoryPool)) < 0 || @@ -5405,11 +7838,17 @@ CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc* map, size_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)map, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolGetAccess(CUmemAccess_flags* flags, CUmemoryPool memPool, CUmemLocation* location) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolGetAccess) < 0 || rpc_write(0, flags, sizeof(CUmemAccess_flags)) < 0 || @@ -5420,11 +7859,16 @@ CUresult cuMemPoolGetAccess(CUmemAccess_flags* flags, CUmemoryPool memPool, CUme rpc_read(0, location, sizeof(CUmemLocation)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolCreate(CUmemoryPool* pool, const CUmemPoolProps* poolProps) { + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolCreate) < 0 || rpc_write(0, pool, sizeof(CUmemoryPool)) < 0 || @@ -5433,22 +7877,30 @@ CUresult cuMemPoolCreate(CUmemoryPool* pool, const CUmemPoolProps* 
poolProps) rpc_read(0, pool, sizeof(CUmemoryPool)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolDestroy(CUmemoryPool pool) { + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolDestroy) < 0 || rpc_write(0, &pool, sizeof(CUmemoryPool)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAllocFromPoolAsync) < 0 || rpc_write(0, dptr, sizeof(CUdeviceptr)) < 0 || @@ -5459,11 +7911,17 @@ CUresult cuMemAllocFromPoolAsync(CUdeviceptr* dptr, size_t bytesize, CUmemoryPoo rpc_read(0, dptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytesize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolExportPointer(CUmemPoolPtrExportData* shareData_out, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, (void*)shareData_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolExportPointer) < 0 || rpc_write(0, shareData_out, sizeof(CUmemPoolPtrExportData)) < 0 || @@ -5472,11 +7930,16 @@ CUresult cuMemPoolExportPointer(CUmemPoolPtrExportData* shareData_out, CUdevicep rpc_read(0, shareData_out, sizeof(CUmemPoolPtrExportData)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)shareData_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPoolImportPointer(CUdeviceptr* ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData* shareData) { + maybe_copy_unified_arg(0, (void*)ptr_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)shareData, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPoolImportPointer) < 0 || rpc_write(0, ptr_out, sizeof(CUdeviceptr)) < 0 || @@ -5487,11 +7950,18 @@ CUresult cuMemPoolImportPointer(CUdeviceptr* ptr_out, CUmemoryPool pool, CUmemPo rpc_read(0, shareData, sizeof(CUmemPoolPtrExportData)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)ptr_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&pool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)shareData, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemPrefetchAsync) < 0 || rpc_write(0, &devPtr, 
sizeof(CUdeviceptr)) < 0 || @@ -5501,11 +7971,19 @@ CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) { + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemAdvise) < 0 || rpc_write(0, &devPtr, sizeof(CUdeviceptr)) < 0 || @@ -5515,11 +7993,21 @@ CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUde rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuMemRangeGetAttributes(void** data, size_t* dataSizes, CUmem_range_attribute* attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count) { + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuMemRangeGetAttributes) < 0 || rpc_write(0, data, sizeof(void*)) < 0 || @@ -5534,11 +8022,20 @@ CUresult cuMemRangeGetAttributes(void** data, size_t* dataSizes, CUmem_range_att rpc_read(0, attributes, sizeof(CUmem_range_attribute)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuPointerSetAttribute(const void* value, CUpointer_attribute attribute, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attribute, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuPointerSetAttribute) < 0 || rpc_write(0, &value, sizeof(const void*)) < 0 || @@ -5547,11 +8044,18 @@ CUresult cuPointerSetAttribute(const void* value, CUpointer_attribute attribute, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attribute, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* attributes, void** data, CUdeviceptr ptr) { + maybe_copy_unified_arg(0, 
(void*)&numAttributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuPointerGetAttributes) < 0 || rpc_write(0, &numAttributes, sizeof(unsigned int)) < 0 || @@ -5563,11 +8067,17 @@ CUresult cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* rpc_read(0, data, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamCreate(CUstream* phStream, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamCreate) < 0 || rpc_write(0, phStream, sizeof(CUstream)) < 0 || @@ -5576,11 +8086,16 @@ CUresult cuStreamCreate(CUstream* phStream, unsigned int Flags) rpc_read(0, phStream, sizeof(CUstream)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority) { + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, 
RPC_cuStreamCreateWithPriority) < 0 || rpc_write(0, phStream, sizeof(CUstream)) < 0 || @@ -5590,11 +8105,16 @@ CUresult cuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int rpc_read(0, phStream, sizeof(CUstream)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetPriority(CUstream hStream, int* priority) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetPriority) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5603,11 +8123,15 @@ CUresult cuStreamGetPriority(CUstream hStream, int* priority) rpc_read(0, priority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetFlags(CUstream hStream, unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetFlags) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5616,11 +8140,15 @@ CUresult cuStreamGetFlags(CUstream hStream, unsigned int* flags) rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetId(CUstream hStream, 
unsigned long long* streamId) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetId) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5629,11 +8157,15 @@ CUresult cuStreamGetId(CUstream hStream, unsigned long long* streamId) rpc_read(0, streamId, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetCtx) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5642,11 +8174,16 @@ CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx) rpc_read(0, pctx, sizeof(CUcontext)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWaitEvent) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5655,11 +8192,16 @@ CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamBeginCapture_v2) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5667,11 +8209,14 @@ CUresult cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode* mode) { + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuThreadExchangeStreamCaptureMode) < 0 || rpc_write(0, mode, sizeof(CUstreamCaptureMode)) < 0 || @@ -5679,11 +8224,14 @@ CUresult cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode* mode) rpc_read(0, mode, sizeof(CUstreamCaptureMode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamEndCapture(CUstream hStream, CUgraph* phGraph) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamEndCapture) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5692,11 +8240,15 @@ CUresult cuStreamEndCapture(CUstream hStream, CUgraph* 
phGraph) rpc_read(0, phGraph, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureStatus) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)captureStatus, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamIsCapturing) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5705,11 +8257,17 @@ CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureSta rpc_read(0, captureStatus, sizeof(CUstreamCaptureStatus)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)captureStatus, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* dependencies, size_t numDependencies, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamUpdateCaptureDependencies) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5720,11 +8278,19 @@ CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode* depend rpc_read(0, dependencies, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamAttachMemAsync) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5734,44 +8300,56 @@ CUresult cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t lengt rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&length, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamQuery(CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamQuery) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamSynchronize(CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamSynchronize) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamDestroy_v2(CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamDestroy_v2) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamCopyAttributes(CUstream dst, CUstream src) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamCopyAttributes) < 0 || rpc_write(0, &dst, sizeof(CUstream)) < 0 || @@ -5779,11 +8357,16 @@ CUresult cuStreamCopyAttributes(CUstream dst, CUstream src) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamGetAttribute) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5793,11 +8376,17 @@ CUresult cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAtt rpc_read(0, value_out, sizeof(CUstreamAttrValue)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue* value) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamSetAttribute) < 0 || rpc_write(0, &hStream, sizeof(CUstream)) < 0 || @@ -5806,11 +8395,16 @@ CUresult cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventCreate(CUevent* phEvent, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventCreate) < 0 || rpc_write(0, phEvent, sizeof(CUevent)) < 0 || @@ -5819,11 +8413,15 @@ CUresult cuEventCreate(CUevent* phEvent, unsigned int Flags) rpc_read(0, phEvent, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventRecord(CUevent hEvent, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventRecord) < 0 || 
rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || @@ -5831,11 +8429,16 @@ CUresult cuEventRecord(CUevent hEvent, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventRecordWithFlags) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || @@ -5844,44 +8447,56 @@ CUresult cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int f rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventQuery(CUevent hEvent) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventQuery) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventSynchronize(CUevent hEvent) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventSynchronize) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventDestroy_v2(CUevent hEvent) { + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventDestroy_v2) < 0 || rpc_write(0, &hEvent, sizeof(CUevent)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hEvent, cudaMemcpyDeviceToHost); return return_value; } CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) { + maybe_copy_unified_arg(0, (void*)pMilliseconds, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStart, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hEnd, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuEventElapsedTime) < 0 || rpc_write(0, pMilliseconds, sizeof(float)) < 0 || @@ -5891,11 +8506,16 @@ CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd) rpc_read(0, pMilliseconds, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pMilliseconds, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStart, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hEnd, cudaMemcpyDeviceToHost); return return_value; } CUresult cuImportExternalMemory(CUexternalMemory* extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC* memHandleDesc) { + maybe_copy_unified_arg(0, (void*)extMem_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memHandleDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuImportExternalMemory) < 0 || rpc_write(0, extMem_out, sizeof(CUexternalMemory)) < 0 || @@ -5904,11 +8524,16 @@ CUresult cuImportExternalMemory(CUexternalMemory* extMem_out, const CUDA_EXTERNA 
rpc_read(0, extMem_out, sizeof(CUexternalMemory)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extMem_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memHandleDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuExternalMemoryGetMappedBuffer(CUdeviceptr* devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC* bufferDesc) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuExternalMemoryGetMappedBuffer) < 0 || rpc_write(0, devPtr, sizeof(CUdeviceptr)) < 0 || @@ -5918,11 +8543,17 @@ CUresult cuExternalMemoryGetMappedBuffer(CUdeviceptr* devPtr, CUexternalMemory e rpc_read(0, devPtr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray* mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* mipmapDesc) { + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuExternalMemoryGetMappedMipmappedArray) < 0 || rpc_write(0, mipmap, sizeof(CUmipmappedArray)) < 0 || @@ -5932,22 +8563,29 @@ CUresult cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray* mipmap, CUext rpc_read(0, mipmap, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDestroyExternalMemory(CUexternalMemory extMem) { + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDestroyExternalMemory) < 0 || rpc_write(0, &extMem, sizeof(CUexternalMemory)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); return return_value; } CUresult cuImportExternalSemaphore(CUexternalSemaphore* extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* semHandleDesc) { + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuImportExternalSemaphore) < 0 || rpc_write(0, extSem_out, sizeof(CUexternalSemaphore)) < 0 || @@ -5956,11 +8594,17 @@ CUresult cuImportExternalSemaphore(CUexternalSemaphore* extSem_out, const CUDA_E rpc_read(0, extSem_out, sizeof(CUexternalSemaphore)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSignalExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray, unsigned int numExtSems, CUstream stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&stream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSignalExternalSemaphoresAsync) < 0 || rpc_write(0, &extSemArray, sizeof(const CUexternalSemaphore*)) < 0 || @@ -5970,11 +8614,19 @@ CUresult cuSignalExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuWaitExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray, unsigned int numExtSems, CUstream stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuWaitExternalSemaphoresAsync) < 0 || rpc_write(0, &extSemArray, sizeof(const CUexternalSemaphore*)) < 0 || @@ -5984,22 +8636,32 @@ CUresult cuWaitExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDestroyExternalSemaphore(CUexternalSemaphore extSem) { + maybe_copy_unified_arg(0, (void*)&extSem, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDestroyExternalSemaphore) < 0 || rpc_write(0, &extSem, sizeof(CUexternalSemaphore)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWaitValue32_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6009,11 +8671,19 @@ CUresult cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t va rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWaitValue64_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6023,11 +8693,19 @@ CUresult cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, 
cuuint64_t va rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWriteValue32_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6037,11 +8715,19 @@ CUresult cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t v rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamWriteValue64_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6051,11 +8737,19 @@ CUresult 
cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t v rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&addr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams* paramArray, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuStreamBatchMemOp_v2) < 0 || rpc_write(0, &stream, sizeof(CUstream)) < 0 || @@ -6066,11 +8760,18 @@ CUresult cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatc rpc_read(0, paramArray, sizeof(CUstreamBatchMemOpParams)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc) { + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncGetAttribute) < 0 || rpc_write(0, pi, sizeof(int)) < 0 || @@ -6080,11 +8781,17 @@ CUresult 
cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfu rpc_read(0, pi, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pi, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetAttribute) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6093,11 +8800,16 @@ CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int v rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetCacheConfig) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6105,11 +8817,15 @@ CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult 
cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetSharedMemConfig) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6117,11 +8833,15 @@ CUresult cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncGetModule(CUmodule* hmod, CUfunction hfunc) { + maybe_copy_unified_arg(0, (void*)hmod, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncGetModule) < 0 || rpc_write(0, hmod, sizeof(CUmodule)) < 0 || @@ -6130,11 +8850,24 @@ CUresult cuFuncGetModule(CUmodule* hmod, CUfunction hfunc) rpc_read(0, hmod, sizeof(CUmodule)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)hmod, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)extra, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchKernel) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6151,11 +8884,32 @@ CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)extra, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&gridDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchCooperativeKernel) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6172,11 +8926,24 @@ CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned rpc_read(0, kernelParams, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimX, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimY, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDimZ, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sharedMemBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)kernelParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); 
CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchCooperativeKernelMultiDevice) < 0 || rpc_write(0, launchParamsList, sizeof(CUDA_LAUNCH_PARAMS)) < 0 || @@ -6186,11 +8953,18 @@ CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsLi rpc_read(0, launchParamsList, sizeof(CUDA_LAUNCH_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&z, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetBlockShape) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6200,11 +8974,17 @@ CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&z, cudaMemcpyDeviceToHost); return return_value; } CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuFuncSetSharedSize) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6212,11 +8992,15 @@ CUresult cuFuncSetSharedSize(CUfunction hfunc, 
unsigned int bytes) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numbytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSetSize) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6224,11 +9008,16 @@ CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numbytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSeti) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6237,11 +9026,17 @@ CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSetf(CUfunction hfunc, int offset, float value) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSetf) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6250,22 +9045,30 @@ CUresult cuParamSetf(CUfunction hfunc, int offset, float value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunch(CUfunction f) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunch) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height) { + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchGrid) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6274,11 +9077,18 @@ CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyDeviceToHost); return return_value; } CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) { + 
maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuLaunchGridAsync) < 0 || rpc_write(0, &f, sizeof(CUfunction)) < 0 || @@ -6288,11 +9098,18 @@ CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstre rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&f, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&grid_height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texunit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuParamSetTexRef) < 0 || rpc_write(0, &hfunc, sizeof(CUfunction)) < 0 || @@ -6301,11 +9118,16 @@ CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hfunc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texunit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphCreate(CUgraph* phGraph, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuGraphCreate) < 0 || rpc_write(0, phGraph, sizeof(CUgraph)) < 0 || @@ -6314,11 +9136,18 @@ CUresult cuGraphCreate(CUgraph* phGraph, unsigned int flags) rpc_read(0, phGraph, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddKernelNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddKernelNode_v2) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6330,11 +9159,18 @@ CUresult cuGraphAddKernelNode_v2(CUgraphNode* phGraphNode, CUgraph hGraph, const rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeGetParams_v2(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeGetParams_v2) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6343,11 +9179,15 @@ CUresult cuGraphKernelNodeGetParams_v2(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAM rpc_read(0, nodeParams, sizeof(CUDA_KERNEL_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeSetParams_v2(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeSetParams_v2) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6355,11 +9195,19 @@ CUresult cuGraphKernelNodeSetParams_v2(CUgraphNode hNode, const CUDA_KERNEL_NODE rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemcpyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMCPY3D* copyParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, 
RPC_cuGraphAddMemcpyNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6372,11 +9220,19 @@ CUresult cuGraphAddMemcpyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CU rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemcpyNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6385,11 +9241,15 @@ CUresult cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D* nodeParams rpc_read(0, nodeParams, sizeof(CUDA_MEMCPY3D)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemcpyNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6397,11 +9257,19 @@ CUresult cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const 
CUDA_MEMCPY3D* node rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemsetNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemsetNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6414,11 +9282,19 @@ CUresult cuGraphAddMemsetNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CU rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuGraphMemsetNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6427,11 +9303,15 @@ CUresult cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS* rpc_read(0, nodeParams, sizeof(CUDA_MEMSET_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemsetNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6439,11 +9319,18 @@ CUresult cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PA rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddHostNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddHostNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6455,11 +9342,18 @@ CUresult 
cuGraphAddHostNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgr rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphHostNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6468,11 +9362,15 @@ CUresult cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS* node rpc_read(0, nodeParams, sizeof(CUDA_HOST_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphHostNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6480,11 +9378,18 @@ CUresult cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddChildGraphNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraph childGraph) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddChildGraphNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6496,11 +9401,18 @@ CUresult cuGraphAddChildGraphNode(CUgraphNode* phGraphNode, CUgraph hGraph, cons rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph* phGraph) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphChildGraphNodeGetGraph) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6509,11 +9421,17 @@ CUresult cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph* phGraph) rpc_read(0, phGraph, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + 
maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)phGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddEmptyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddEmptyNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6524,11 +9442,20 @@ CUresult cuGraphAddEmptyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUg rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddEventRecordNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUevent event) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddEventRecordNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6540,11 +9467,18 @@ CUresult cuGraphAddEventRecordNode(CUgraphNode* 
phGraphNode, CUgraph hGraph, con rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent* event_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventRecordNodeGetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6553,11 +9487,15 @@ CUresult cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent* event_out) rpc_read(0, event_out, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventRecordNodeSetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6565,11 +9503,18 @@ CUresult cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); 
return return_value; } CUresult cuGraphAddEventWaitNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUevent event) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddEventWaitNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6581,11 +9526,18 @@ CUresult cuGraphAddEventWaitNode(CUgraphNode* phGraphNode, CUgraph hGraph, const rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent* event_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventWaitNodeGetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6594,11 +9546,15 @@ CUresult cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent* event_out) rpc_read(0, event_out, sizeof(CUevent)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphEventWaitNodeSetEvent) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6606,11 +9562,18 @@ CUresult cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddExternalSemaphoresSignalNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddExternalSemaphoresSignalNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6622,11 +9585,18 @@ CUresult cuGraphAddExternalSemaphoresSignalNode(CUgraphNode* phGraphNode, CUgrap rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresSignalNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6635,11 +9605,15 @@ CUresult cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EX rpc_read(0, params_out, sizeof(CUDA_EXT_SEM_SIGNAL_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6647,11 +9621,18 @@ CUresult cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const C rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddExternalSemaphoresWaitNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddExternalSemaphoresWaitNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6663,11 +9644,18 @@ CUresult cuGraphAddExternalSemaphoresWaitNode(CUgraphNode* phGraphNode, CUgraph rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresWaitNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6676,11 +9664,15 @@ CUresult cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_ rpc_read(0, params_out, sizeof(CUDA_EXT_SEM_WAIT_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const 
CUDA_EXT_SEM_WAIT_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6688,11 +9680,18 @@ CUresult cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUD rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddBatchMemOpNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddBatchMemOpNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6704,11 +9703,18 @@ CUresult cuGraphAddBatchMemOpNode(CUgraphNode* phGraphNode, CUgraph hGraph, cons rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphBatchMemOpNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6717,11 +9723,15 @@ CUresult cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NOD rpc_read(0, nodeParams_out, sizeof(CUDA_BATCH_MEM_OP_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphBatchMemOpNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6729,11 +9739,16 @@ CUresult cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuGraphExecBatchMemOpNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -6742,11 +9757,19 @@ CUresult cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemAllocNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemAllocNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6759,11 +9782,18 @@ CUresult cuGraphAddMemAllocNode(CUgraphNode* phGraphNode, CUgraph hGraph, const rpc_read(0, nodeParams, sizeof(CUDA_MEM_ALLOC_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS* 
params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemAllocNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6772,11 +9802,20 @@ CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PAR rpc_read(0, params_out, sizeof(CUDA_MEM_ALLOC_NODE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddMemFreeNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUdeviceptr dptr) { + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddMemFreeNode) < 0 || rpc_write(0, phGraphNode, sizeof(CUgraphNode)) < 0 || @@ -6788,11 +9827,20 @@ CUresult cuGraphAddMemFreeNode(CUgraphNode* phGraphNode, CUgraph hGraph, const C rpc_read(0, phGraphNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies,
cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr* dptr_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dptr_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphMemFreeNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6801,22 +9849,28 @@ CUresult cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr* dptr_out) rpc_read(0, dptr_out, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dptr_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGraphMemTrim(CUdevice device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGraphMemTrim) < 0 || rpc_write(0, &device, sizeof(CUdevice)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphClone(CUgraph* phGraphClone, CUgraph originalGraph) { + maybe_copy_unified_arg(0, (void*)phGraphClone, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphClone) < 0 || rpc_write(0, phGraphClone, sizeof(CUgraph)) < 0 || @@ -6825,11 +9879,16 @@ CUresult cuGraphClone(CUgraph* phGraphClone, CUgraph originalGraph) rpc_read(0, phGraphClone, sizeof(CUgraph)) < 0 || rpc_end_response(0, &return_value) < 0) return
CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphClone, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeFindInClone(CUgraphNode* phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph) { + maybe_copy_unified_arg(0, (void*)phNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOriginalNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hClonedGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeFindInClone) < 0 || rpc_write(0, phNode, sizeof(CUgraphNode)) < 0 || @@ -6839,11 +9898,16 @@ CUresult cuGraphNodeFindInClone(CUgraphNode* phNode, CUgraphNode hOriginalNode, rpc_read(0, phNode, sizeof(CUgraphNode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOriginalNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hClonedGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType* type) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetType) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6852,11 +9916,16 @@ CUresult cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType* type) rpc_read(0, type, sizeof(CUgraphNodeType)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)type, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNodes) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphGetNodes) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6867,11 +9936,17 @@ CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNodes) rpc_read(0, numNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* numRootNodes) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)rootNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numRootNodes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphGetRootNodes) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6882,11 +9957,18 @@ CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* num rpc_read(0, numRootNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)rootNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numRootNodes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from, CUgraphNode* to, size_t* numEdges) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuGraphGetEdges) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6899,11 +9981,18 @@ CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from, CUgraphNode* to, siz rpc_read(0, numEdges, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetDependencies) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6914,11 +10003,17 @@ CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies rpc_read(0, numDependencies, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependentNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDependentNodes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetDependentNodes) < 0 
|| rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -6929,11 +10024,18 @@ CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentN rpc_read(0, numDependentNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependentNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDependentNodes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphAddDependencies) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6943,11 +10045,19 @@ CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from, const C rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuGraphRemoveDependencies) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -6957,22 +10067,31 @@ CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from, cons rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphDestroyNode(CUgraphNode hNode) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphDestroyNode) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphInstantiateWithFlags(CUgraphExec* phGraphExec, CUgraph hGraph, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphInstantiateWithFlags) < 0 || rpc_write(0, phGraphExec, sizeof(CUgraphExec)) < 0 || @@ -6982,11 +10101,17 @@ CUresult cuGraphInstantiateWithFlags(CUgraphExec* phGraphExec, CUgraph hGraph, u rpc_read(0, phGraphExec, sizeof(CUgraphExec)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return 
return_value; } CUresult cuGraphInstantiateWithParams(CUgraphExec* phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS* instantiateParams) { + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphInstantiateWithParams) < 0 || rpc_write(0, phGraphExec, sizeof(CUgraphExec)) < 0 || @@ -6997,11 +10122,16 @@ CUresult cuGraphInstantiateWithParams(CUgraphExec* phGraphExec, CUgraph hGraph, rpc_read(0, instantiateParams, sizeof(CUDA_GRAPH_INSTANTIATE_PARAMS)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t* flags) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecGetFlags) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7010,11 +10140,16 @@ CUresult cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t* flags) rpc_read(0, flags, sizeof(cuuint64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecKernelNodeSetParams_v2(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecKernelNodeSetParams_v2) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7023,11 +10158,18 @@ CUresult cuGraphExecKernelNodeSetParams_v2(CUgraphExec hGraphExec, CUgraphNode h rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D* copyParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecMemcpyNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7037,11 +10179,19 @@ CUresult cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)copyParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecMemsetNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7051,11 +10201,18 @@ CUresult cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)memsetParams, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ctx, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecHostNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7064,11 +10221,17 @@ CUresult cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecChildGraphNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7077,11 +10240,17 @@ CUresult cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecEventRecordNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7090,11 +10259,17 @@ CUresult cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuGraphExecEventWaitNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7103,11 +10278,17 @@ CUresult cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hN rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7116,11 +10297,17 @@ CUresult cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, 
RPC_cuGraphExecExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7129,11 +10316,17 @@ CUresult cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeSetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7142,11 +10335,17 @@ CUresult cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsign rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int* isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphNodeGetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7156,11 +10355,16 @@ CUresult cuGraphNodeGetEnabled(CUgraphExec hGraphExec, 
CUgraphNode hNode, unsign rpc_read(0, isEnabled, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphUpload) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7168,11 +10372,15 @@ CUresult cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphLaunch) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7180,33 +10388,42 @@ CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecDestroy(CUgraphExec hGraphExec) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecDestroy) < 
0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphDestroy(CUgraph hGraph) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphDestroy) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphExecUpdate_v2(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo* resultInfo) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphExecUpdate_v2) < 0 || rpc_write(0, &hGraphExec, sizeof(CUgraphExec)) < 0 || @@ -7216,11 +10433,16 @@ CUresult cuGraphExecUpdate_v2(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExe rpc_read(0, resultInfo, sizeof(CUgraphExecUpdateResultInfo)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeCopyAttributes) < 0 || rpc_write(0, &dst, sizeof(CUgraphNode)) < 0 || 
@@ -7228,11 +10450,16 @@ CUresult cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeGetAttribute) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -7242,11 +10469,17 @@ CUresult cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID att rpc_read(0, value_out, sizeof(CUkernelNodeAttrValue)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue* value) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphKernelNodeSetAttribute) < 0 || rpc_write(0, &hNode, sizeof(CUgraphNode)) < 0 || @@ -7255,11 +10488,17 @@ CUresult cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID att rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + 
maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphDebugDotPrint(CUgraph hGraph, const char* path, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphDebugDotPrint) < 0 || rpc_write(0, &hGraph, sizeof(CUgraph)) < 0 || @@ -7268,11 +10507,16 @@ CUresult cuGraphDebugDotPrint(CUgraph hGraph, const char* path, unsigned int fla rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuUserObjectRetain(CUuserObject object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuUserObjectRetain) < 0 || rpc_write(0, &object, sizeof(CUuserObject)) < 0 || @@ -7280,11 +10524,15 @@ CUresult cuUserObjectRetain(CUuserObject object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuUserObjectRelease(CUuserObject object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); 
CUresult return_value; if (rpc_start_request(0, RPC_cuUserObjectRelease) < 0 || rpc_write(0, &object, sizeof(CUuserObject)) < 0 || @@ -7292,11 +10540,17 @@ CUresult cuUserObjectRelease(CUuserObject object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphRetainUserObject) < 0 || rpc_write(0, &graph, sizeof(CUgraph)) < 0 || @@ -7306,11 +10560,18 @@ CUresult cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned in rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphReleaseUserObject) < 0 || rpc_write(0, &graph, sizeof(CUgraph)) < 0 || @@ -7319,11 +10580,18 @@ CUresult 
cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned i rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxActiveBlocksPerMultiprocessor) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -7334,11 +10602,20 @@ CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, CUfunction rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -7350,11 +10627,20 @@ CUresult cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CU rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) { + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyAvailableDynamicSMemPerBlock) < 0 || rpc_write(0, dynamicSmemSize, sizeof(size_t)) < 0 || @@ -7365,11 +10651,18 @@ CUresult cuOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, CUfunc rpc_read(0, dynamicSmemSize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxPotentialClusterSize(int* clusterSize, CUfunction func, const CUlaunchConfig* 
config) { + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxPotentialClusterSize) < 0 || rpc_write(0, clusterSize, sizeof(int)) < 0 || @@ -7379,11 +10672,17 @@ CUresult cuOccupancyMaxPotentialClusterSize(int* clusterSize, CUfunction func, c rpc_read(0, clusterSize, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuOccupancyMaxActiveClusters(int* numClusters, CUfunction func, const CUlaunchConfig* config) { + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuOccupancyMaxActiveClusters) < 0 || rpc_write(0, numClusters, sizeof(int)) < 0 || @@ -7393,11 +10692,17 @@ CUresult cuOccupancyMaxActiveClusters(int* numClusters, CUfunction func, const C rpc_read(0, numClusters, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, 
cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetArray) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7406,11 +10711,17 @@ CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmappedArray) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7419,11 +10730,18 @@ CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmapped rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetAddress_v2(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes) { + maybe_copy_unified_arg(0, (void*)ByteOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetAddress_v2) < 0 || rpc_write(0, ByteOffset, 
sizeof(size_t)) < 0 || @@ -7434,11 +10752,19 @@ CUresult cuTexRefSetAddress_v2(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr rpc_read(0, ByteOffset, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)ByteOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bytes, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetAddress2D_v3(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, CUdeviceptr dptr, size_t Pitch) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Pitch, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetAddress2D_v3) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7448,11 +10774,18 @@ CUresult cuTexRefSetAddress2D_v3(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Pitch, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fmt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&NumPackedComponents, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetFormat) < 0 || rpc_write(0, &hTexRef, 
sizeof(CUtexref)) < 0 || @@ -7461,11 +10794,17 @@ CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedCo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fmt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&NumPackedComponents, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&am, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetAddressMode) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7474,11 +10813,16 @@ CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&am, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetFilterMode) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7486,11 +10830,15 @@ CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fm, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmapFilterMode) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7498,11 +10846,15 @@ CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&fm, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bias, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmapLevelBias) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7510,11 +10862,16 @@ CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bias, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minMipmapLevelClamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxMipmapLevelClamp, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMipmapLevelClamp) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7523,11 +10880,16 @@ CUresult 
cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minMipmapLevelClamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxMipmapLevelClamp, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxAniso, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetMaxAnisotropy) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7535,11 +10897,15 @@ CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxAniso, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetBorderColor) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7548,11 +10914,15 @@ CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor) rpc_read(0, pBorderColor, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) { + maybe_copy_unified_arg(0, 
(void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefSetFlags) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || @@ -7560,11 +10930,15 @@ CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetAddress_v2(CUdeviceptr* pdptr, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetAddress_v2) < 0 || rpc_write(0, pdptr, sizeof(CUdeviceptr)) < 0 || @@ -7573,11 +10947,15 @@ CUresult cuTexRefGetAddress_v2(CUdeviceptr* pdptr, CUtexref hTexRef) rpc_read(0, pdptr, sizeof(CUdeviceptr)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pdptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetArray(CUarray* phArray, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetArray) < 0 || rpc_write(0, phArray, sizeof(CUarray)) < 0 || @@ -7586,11 +10964,15 @@ CUresult cuTexRefGetArray(CUarray* phArray, CUtexref hTexRef) rpc_read(0, phArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return 
return_value; } CUresult cuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)phMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmappedArray) < 0 || rpc_write(0, phMipmappedArray, sizeof(CUmipmappedArray)) < 0 || @@ -7599,11 +10981,16 @@ CUresult cuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref rpc_read(0, phMipmappedArray, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetAddressMode(CUaddress_mode* pam, CUtexref hTexRef, int dim) { + maybe_copy_unified_arg(0, (void*)pam, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetAddressMode) < 0 || rpc_write(0, pam, sizeof(CUaddress_mode)) < 0 || @@ -7613,11 +11000,16 @@ CUresult cuTexRefGetAddressMode(CUaddress_mode* pam, CUtexref hTexRef, int dim) rpc_read(0, pam, sizeof(CUaddress_mode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pam, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dim, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetFilterMode) < 0 || rpc_write(0, pfm, 
sizeof(CUfilter_mode)) < 0 || @@ -7626,11 +11018,16 @@ CUresult cuTexRefGetFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) rpc_read(0, pfm, sizeof(CUfilter_mode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetFormat(CUarray_format* pFormat, int* pNumChannels, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pFormat, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumChannels, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetFormat) < 0 || rpc_write(0, pFormat, sizeof(CUarray_format)) < 0 || @@ -7641,11 +11038,16 @@ CUresult cuTexRefGetFormat(CUarray_format* pFormat, int* pNumChannels, CUtexref rpc_read(0, pNumChannels, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFormat, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumChannels, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmapFilterMode) < 0 || rpc_write(0, pfm, sizeof(CUfilter_mode)) < 0 || @@ -7654,11 +11056,15 @@ CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexref hTexRef) rpc_read(0, pfm, sizeof(CUfilter_mode)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pfm, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, 
cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pbias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmapLevelBias) < 0 || rpc_write(0, pbias, sizeof(float)) < 0 || @@ -7667,11 +11073,16 @@ CUresult cuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef) rpc_read(0, pbias, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pbias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pminMipmapLevelClamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pmaxMipmapLevelClamp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetMipmapLevelClamp) < 0 || rpc_write(0, pminMipmapLevelClamp, sizeof(float)) < 0 || @@ -7682,11 +11093,16 @@ CUresult cuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMip rpc_read(0, pmaxMipmapLevelClamp, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pminMipmapLevelClamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pmaxMipmapLevelClamp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pmaxAniso, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuTexRefGetMaxAnisotropy) < 0 || rpc_write(0, pmaxAniso, sizeof(int)) < 0 || @@ -7695,11 +11111,15 @@ CUresult cuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef) rpc_read(0, pmaxAniso, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pmaxAniso, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetBorderColor) < 0 || rpc_write(0, pBorderColor, sizeof(float)) < 0 || @@ -7708,11 +11128,15 @@ CUresult cuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef) rpc_read(0, pBorderColor, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pBorderColor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefGetFlags) < 0 || rpc_write(0, pFlags, sizeof(unsigned int)) < 0 || @@ -7721,11 +11145,14 @@ CUresult cuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef) rpc_read(0, pFlags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pFlags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefCreate(CUtexref* pTexRef) { + 
maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefCreate) < 0 || rpc_write(0, pTexRef, sizeof(CUtexref)) < 0 || @@ -7733,22 +11160,28 @@ CUresult cuTexRefCreate(CUtexref* pTexRef) rpc_read(0, pTexRef, sizeof(CUtexref)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexRefDestroy(CUtexref hTexRef) { + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexRefDestroy) < 0 || rpc_write(0, &hTexRef, sizeof(CUtexref)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hTexRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfRefSetArray) < 0 || rpc_write(0, &hSurfRef, sizeof(CUsurfref)) < 0 || @@ -7757,11 +11190,16 @@ CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flag rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef) { + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyHostToDevice); CUresult 
return_value; if (rpc_start_request(0, RPC_cuSurfRefGetArray) < 0 || rpc_write(0, phArray, sizeof(CUarray)) < 0 || @@ -7770,11 +11208,17 @@ CUresult cuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef) rpc_read(0, phArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)phArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hSurfRef, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectCreate(CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc) { + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectCreate) < 0 || rpc_write(0, pTexObject, sizeof(CUtexObject)) < 0 || @@ -7785,22 +11229,30 @@ CUresult cuTexObjectCreate(CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pR rpc_read(0, pTexObject, sizeof(CUtexObject)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectDestroy(CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectDestroy) < 0 || rpc_write(0, &texObject, sizeof(CUtexObject)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + 
maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectGetResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || @@ -7809,11 +11261,15 @@ CUresult cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUtexObject te rpc_read(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC* pTexDesc, CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectGetTextureDesc) < 0 || rpc_write(0, pTexDesc, sizeof(CUDA_TEXTURE_DESC)) < 0 || @@ -7822,11 +11278,15 @@ CUresult cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC* pTexDesc, CUtexObject texO rpc_read(0, pTexDesc, sizeof(CUDA_TEXTURE_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC* pResViewDesc, CUtexObject texObject) { + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuTexObjectGetResourceViewDesc) < 0 || 
rpc_write(0, pResViewDesc, sizeof(CUDA_RESOURCE_VIEW_DESC)) < 0 || @@ -7835,11 +11295,15 @@ CUresult cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC* pResViewDesc, C rpc_read(0, pResViewDesc, sizeof(CUDA_RESOURCE_VIEW_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfObjectCreate(CUsurfObject* pSurfObject, const CUDA_RESOURCE_DESC* pResDesc) { + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfObjectCreate) < 0 || rpc_write(0, pSurfObject, sizeof(CUsurfObject)) < 0 || @@ -7848,22 +11312,28 @@ CUresult cuSurfObjectCreate(CUsurfObject* pSurfObject, const CUDA_RESOURCE_DESC* rpc_read(0, pSurfObject, sizeof(CUsurfObject)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfObjectDestroy(CUsurfObject surfObject) { + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuSurfObjectDestroy) < 0 || rpc_write(0, &surfObject, sizeof(CUsurfObject)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); CUresult return_value; if 
(rpc_start_request(0, RPC_cuSurfObjectGetResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || @@ -7872,11 +11342,16 @@ CUresult cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject rpc_read(0, pResDesc, sizeof(CUDA_RESOURCE_DESC)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDev) { + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&peerDev, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceCanAccessPeer) < 0 || rpc_write(0, canAccessPeer, sizeof(int)) < 0 || @@ -7886,11 +11361,16 @@ CUresult cuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDe rpc_read(0, canAccessPeer, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dev, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&peerDev, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) { + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxEnablePeerAccess) < 0 || rpc_write(0, &peerContext, sizeof(CUcontext)) < 0 || @@ -7898,22 +11378,30 @@ CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + 
maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuCtxDisablePeerAccess(CUcontext peerContext) { + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuCtxDisablePeerAccess) < 0 || rpc_write(0, &peerContext, sizeof(CUcontext)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&peerContext, cudaMemcpyDeviceToHost); return return_value; } CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuDeviceGetP2PAttribute) < 0 || rpc_write(0, value, sizeof(int)) < 0 || @@ -7924,22 +11412,32 @@ CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdev rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attrib, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsUnregisterResource(CUgraphicsResource resource) { + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsUnregisterResource) < 0 || rpc_write(0, &resource, sizeof(CUgraphicsResource)) < 0 || rpc_wait_for_response(0) < 
0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel) { + maybe_copy_unified_arg(0, (void*)pArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsSubResourceGetMappedArray) < 0 || rpc_write(0, pArray, sizeof(CUarray)) < 0 || @@ -7950,11 +11448,17 @@ CUresult cuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource rpc_read(0, pArray, sizeof(CUarray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedArray, CUgraphicsResource resource) { + maybe_copy_unified_arg(0, (void*)pMipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsResourceGetMappedMipmappedArray) < 0 || rpc_write(0, pMipmappedArray, sizeof(CUmipmappedArray)) < 0 || @@ -7963,11 +11467,16 @@ CUresult cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedA rpc_read(0, pMipmappedArray, sizeof(CUmipmappedArray)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, 
(void*)pMipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsResourceGetMappedPointer_v2(CUdeviceptr* pDevPtr, size_t* pSize, CUgraphicsResource resource) { + maybe_copy_unified_arg(0, (void*)pDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsResourceGetMappedPointer_v2) < 0 || rpc_write(0, pDevPtr, sizeof(CUdeviceptr)) < 0 || @@ -7978,11 +11487,16 @@ CUresult cuGraphicsResourceGetMappedPointer_v2(CUdeviceptr* pDevPtr, size_t* pSi rpc_read(0, pSize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)pDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsResourceSetMapFlags_v2) < 0 || rpc_write(0, &resource, sizeof(CUgraphicsResource)) < 0 || @@ -7990,11 +11504,16 @@ CUresult cuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&count, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsMapResources) < 0 || rpc_write(0, &count, sizeof(unsigned int)) < 0 || @@ -8004,11 +11523,17 @@ CUresult cuGraphicsMapResources(unsigned int count, CUgraphicsResource* resource rpc_read(0, resources, sizeof(CUgraphicsResource)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } CUresult cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream) { + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); CUresult return_value; if (rpc_start_request(0, RPC_cuGraphicsUnmapResources) < 0 || rpc_write(0, &count, sizeof(unsigned int)) < 0 || @@ -8018,6 +11543,9 @@ CUresult cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resour rpc_read(0, resources, sizeof(CUgraphicsResource)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDA_ERROR_DEVICE_UNAVAILABLE; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } @@ -8043,6 +11571,8 @@ cudaError_t cudaDeviceSynchronize() cudaError_t cudaDeviceSetLimit(enum cudaLimit limit, size_t value) { + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaDeviceSetLimit) < 0 || rpc_write(0, &limit, sizeof(enum cudaLimit)) < 0 || @@ -8050,11 +11580,15 @@ cudaError_t cudaDeviceSetLimit(enum cudaLimit limit, size_t value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetLimit(size_t* pValue, enum cudaLimit limit) { + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetLimit) < 0 || rpc_write(0, pValue, sizeof(size_t)) < 0 || @@ -8063,11 +11597,16 @@ cudaError_t cudaDeviceGetLimit(size_t* pValue, enum cudaLimit limit) rpc_read(0, pValue, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, const struct cudaChannelFormatDesc* fmtDesc, int device) { + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fmtDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetTexture1DLinearMaxWidth) < 0 || rpc_write(0, maxWidthInElements, sizeof(size_t)) < 0 || @@ -8077,11 +11616,15 @@ cudaError_t cudaDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements, con rpc_read(0, maxWidthInElements, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)maxWidthInElements, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)fmtDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetCacheConfig(enum cudaFuncCache* pCacheConfig) { + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetCacheConfig) < 0 || rpc_write(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || @@ -8089,11 +11632,14 @@ cudaError_t cudaDeviceGetCacheConfig(enum cudaFuncCache* pCacheConfig) rpc_read(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) { + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetStreamPriorityRange) < 0 || rpc_write(0, leastPriority, sizeof(int)) < 0 || @@ -8103,22 +11649,27 @@ cudaError_t cudaDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPr rpc_read(0, greatestPriority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)leastPriority, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)greatestPriority, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) { + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceSetCacheConfig) < 0 || rpc_write(0, &cacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig* pConfig) { + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetSharedMemConfig) < 0 || rpc_write(0, pConfig, sizeof(enum cudaSharedMemConfig)) < 0 || @@ -8126,22 +11677,27 @@ cudaError_t cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig* pConfig) rpc_read(0, pConfig, sizeof(enum cudaSharedMemConfig)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) { + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceSetSharedMemConfig) < 0 || rpc_write(0, &config, sizeof(enum cudaSharedMemConfig)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetByPCIBusId(int* device, const char* pciBusId) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetByPCIBusId) < 0 || rpc_write(0, device, sizeof(int)) < 0 || @@ -8150,11 +11706,16 @@ cudaError_t cudaDeviceGetByPCIBusId(int* device, const char* pciBusId) rpc_read(0, device, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetPCIBusId(char* pciBusId, int len, 
int device) { + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetPCIBusId) < 0 || rpc_write(0, pciBusId, sizeof(char)) < 0 || @@ -8164,11 +11725,16 @@ cudaError_t cudaDeviceGetPCIBusId(char* pciBusId, int len, int device) rpc_read(0, pciBusId, sizeof(char)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pciBusId, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t* handle, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaIpcGetEventHandle) < 0 || rpc_write(0, handle, sizeof(cudaIpcEventHandle_t)) < 0 || @@ -8177,11 +11743,15 @@ cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t* handle, cudaEvent_t even rpc_read(0, handle, sizeof(cudaIpcEventHandle_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaIpcOpenEventHandle(cudaEvent_t* event, cudaIpcEventHandle_t handle) { + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaIpcOpenEventHandle) < 0 || rpc_write(0, event, sizeof(cudaEvent_t)) < 0 || @@ -8190,11 +11760,16 @@ cudaError_t cudaIpcOpenEventHandle(cudaEvent_t* event, 
cudaIpcEventHandle_t hand rpc_read(0, event, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaIpcOpenMemHandle(void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaIpcOpenMemHandle) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -8204,11 +11779,16 @@ cudaError_t cudaIpcOpenMemHandle(void** devPtr, cudaIpcMemHandle_t handle, unsig rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope) { + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceFlushGPUDirectRDMAWrites) < 0 || rpc_write(0, &target, sizeof(enum cudaFlushGPUDirectRDMAWritesTarget)) < 0 || @@ -8216,6 +11796,8 @@ cudaError_t cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWrites rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&target, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&scope, cudaMemcpyDeviceToHost); 
return return_value; } @@ -8241,6 +11823,8 @@ cudaError_t cudaThreadSynchronize() cudaError_t cudaThreadSetLimit(enum cudaLimit limit, size_t value) { + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadSetLimit) < 0 || rpc_write(0, &limit, sizeof(enum cudaLimit)) < 0 || @@ -8248,11 +11832,15 @@ cudaError_t cudaThreadSetLimit(enum cudaLimit limit, size_t value) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadGetLimit(size_t* pValue, enum cudaLimit limit) { + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadGetLimit) < 0 || rpc_write(0, pValue, sizeof(size_t)) < 0 || @@ -8261,11 +11849,14 @@ cudaError_t cudaThreadGetLimit(size_t* pValue, enum cudaLimit limit) rpc_read(0, pValue, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pValue, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&limit, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadGetCacheConfig(enum cudaFuncCache* pCacheConfig) { + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadGetCacheConfig) < 0 || rpc_write(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || @@ -8273,17 +11864,20 @@ cudaError_t cudaThreadGetCacheConfig(enum cudaFuncCache* pCacheConfig) rpc_read(0, pCacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_end_response(0, &return_value) < 0) return 
cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pCacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) { + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadSetCacheConfig) < 0 || rpc_write(0, &cacheConfig, sizeof(enum cudaFuncCache)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyDeviceToHost); return return_value; } @@ -8309,17 +11903,21 @@ cudaError_t cudaPeekAtLastError() cudaError_t cudaGetDeviceCount(int* count) { + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDeviceCount) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, count, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device) { + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDeviceProperties_v2) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || @@ -8327,11 +11925,16 @@ cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device) rpc_read(0, prop, sizeof(struct cudaDeviceProp)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetAttribute(int* value, enum cudaDeviceAttr attr, int device) { + maybe_copy_unified_arg(0, (void*)value, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetAttribute) < 0 || rpc_write(0, value, sizeof(int)) < 0 || @@ -8341,11 +11944,16 @@ cudaError_t cudaDeviceGetAttribute(int* value, enum cudaDeviceAttr attr, int dev rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) { + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetDefaultMemPool) < 0 || rpc_write(0, memPool, sizeof(cudaMemPool_t)) < 0 || @@ -8354,11 +11962,15 @@ cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t* memPool, int device) rpc_read(0, memPool, sizeof(cudaMemPool_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceSetMemPool) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || @@ -8366,11 +11978,15 @@ cudaError_t cudaDeviceSetMemPool(int device, cudaMemPool_t memPool) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return 
cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetMemPool(cudaMemPool_t* memPool, int device) { + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetMemPool) < 0 || rpc_write(0, memPool, sizeof(cudaMemPool_t)) < 0 || @@ -8379,11 +11995,17 @@ cudaError_t cudaDeviceGetMemPool(cudaMemPool_t* memPool, int device) rpc_read(0, memPool, sizeof(cudaMemPool_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGetP2PAttribute(int* value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) { + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGetP2PAttribute) < 0 || rpc_write(0, value, sizeof(int)) < 0 || @@ -8394,11 +12016,17 @@ cudaError_t cudaDeviceGetP2PAttribute(int* value, enum cudaDeviceP2PAttr attr, i rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaChooseDevice(int* device, 
const struct cudaDeviceProp* prop) { + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaChooseDevice) < 0 || rpc_write(0, device, sizeof(int)) < 0 || @@ -8407,11 +12035,16 @@ cudaError_t cudaChooseDevice(int* device, const struct cudaDeviceProp* prop) rpc_read(0, device, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)prop, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&deviceFlags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaInitDevice) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || @@ -8420,22 +12053,28 @@ cudaError_t cudaInitDevice(int device, unsigned int deviceFlags, unsigned int fl rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&deviceFlags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDevice(int device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDevice) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDevice(int* device) 
{ + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDevice) < 0 || rpc_write(0, device, sizeof(int)) < 0 || @@ -8443,11 +12082,14 @@ cudaError_t cudaGetDevice(int* device) rpc_read(0, device, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetValidDevices(int* device_arr, int len) { + maybe_copy_unified_arg(0, (void*)device_arr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetValidDevices) < 0 || rpc_write(0, device_arr, sizeof(int)) < 0 || @@ -8456,22 +12098,27 @@ cudaError_t cudaSetValidDevices(int* device_arr, int len) rpc_read(0, device_arr, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)device_arr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&len, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDeviceFlags(unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDeviceFlags) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDeviceFlags(unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDeviceFlags) < 0 || rpc_write(0, flags, sizeof(unsigned int)) < 0 || @@ -8479,11 +12126,13 @@ cudaError_t cudaGetDeviceFlags(unsigned int* flags) rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, 
&return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamCreate(cudaStream_t* pStream) { + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCreate) < 0 || rpc_write(0, pStream, sizeof(cudaStream_t)) < 0 || @@ -8491,11 +12140,14 @@ cudaError_t cudaStreamCreate(cudaStream_t* pStream) rpc_read(0, pStream, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamCreateWithFlags(cudaStream_t* pStream, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCreateWithFlags) < 0 || rpc_write(0, pStream, sizeof(cudaStream_t)) < 0 || @@ -8504,11 +12156,16 @@ cudaError_t cudaStreamCreateWithFlags(cudaStream_t* pStream, unsigned int flags) rpc_read(0, pStream, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamCreateWithPriority(cudaStream_t* pStream, unsigned int flags, int priority) { + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCreateWithPriority) < 0 || rpc_write(0, pStream, sizeof(cudaStream_t)) < 0 || @@ -8518,11 +12175,16 @@ cudaError_t cudaStreamCreateWithPriority(cudaStream_t* pStream, unsigned int 
fla rpc_read(0, pStream, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&priority, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetPriority) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8531,11 +12193,15 @@ cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority) rpc_read(0, priority, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)priority, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetFlags(cudaStream_t hStream, unsigned int* flags) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetFlags) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8544,11 +12210,15 @@ cudaError_t cudaStreamGetFlags(cudaStream_t hStream, unsigned int* flags) rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetId(cudaStream_t hStream, unsigned long long* streamId) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)streamId, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetId) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8557,6 +12227,8 @@ cudaError_t cudaStreamGetId(cudaStream_t hStream, unsigned long long* streamId) rpc_read(0, streamId, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } @@ -8572,6 +12244,8 @@ cudaError_t cudaCtxResetPersistingL2Cache() cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamCopyAttributes) < 0 || rpc_write(0, &dst, sizeof(cudaStream_t)) < 0 || @@ -8579,11 +12253,16 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetAttribute(cudaStream_t hStream, cudaLaunchAttributeID attr, cudaLaunchAttributeValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetAttribute) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8593,11 +12272,17 @@ cudaError_t cudaStreamGetAttribute(cudaStream_t hStream, cudaLaunchAttributeID a rpc_read(0, value_out, sizeof(cudaLaunchAttributeValue)) < 0 || 
rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamSetAttribute(cudaStream_t hStream, cudaLaunchAttributeID attr, const cudaLaunchAttributeValue* value) { + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamSetAttribute) < 0 || rpc_write(0, &hStream, sizeof(cudaStream_t)) < 0 || @@ -8606,22 +12291,30 @@ cudaError_t cudaStreamSetAttribute(cudaStream_t hStream, cudaLaunchAttributeID a rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamDestroy(cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamDestroy) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaStreamWaitEvent) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8630,33 +12323,42 @@ cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamSynchronize(cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamSynchronize) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamQuery(cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamQuery) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamBeginCapture) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8664,11 +12366,14 @@ cudaError_t cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 
0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode* mode) { + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaThreadExchangeStreamCaptureMode) < 0 || rpc_write(0, mode, sizeof(enum cudaStreamCaptureMode)) < 0 || @@ -8676,11 +12381,14 @@ cudaError_t cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode* mode rpc_read(0, mode, sizeof(enum cudaStreamCaptureMode)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamEndCapture) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8689,11 +12397,15 @@ cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph) rpc_read(0, pGraph, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus* pCaptureStatus) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pCaptureStatus, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamIsCapturing) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8702,11 +12414,21 @@ cudaError_t 
cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureSta rpc_read(0, pCaptureStatus, sizeof(enum cudaStreamCaptureStatus)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pCaptureStatus, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus* captureStatus_out, unsigned long long* id_out, cudaGraph_t* graph_out, const cudaGraphNode_t** dependencies_out, size_t* numDependencies_out) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)captureStatus_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)id_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)graph_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numDependencies_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies_out, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(*numDependencies_out); i++) + maybe_copy_unified_arg(0, (void*)&dependencies_out[i], cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamGetCaptureInfo_v2) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8718,11 +12440,25 @@ cudaError_t cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCapt rpc_read(0, dependencies_out, *numDependencies_out * sizeof(const cudaGraphNode_t*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)captureStatus_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)id_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)graph_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numDependencies_out, cudaMemcpyDeviceToHost); +
maybe_copy_unified_arg(0, (void*)dependencies_out, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(*numDependencies_out); i++) + maybe_copy_unified_arg(0, (void*)&dependencies_out[i], cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t* dependencies, size_t numDependencies, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaStreamUpdateCaptureDependencies) < 0 || rpc_write(0, &stream, sizeof(cudaStream_t)) < 0 || @@ -8732,22 +12468,32 @@ cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dependencies, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(numDependencies); i++) + maybe_copy_unified_arg(0, (void*)&dependencies[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventCreate(cudaEvent_t* event) { + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventCreate) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, event, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; +
maybe_copy_unified_arg(0, (void*)event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventCreateWithFlags) < 0 || rpc_write(0, &flags, sizeof(unsigned int)) < 0 || @@ -8755,11 +12501,15 @@ cudaError_t cudaEventCreateWithFlags(cudaEvent_t* event, unsigned int flags) rpc_read(0, event, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventRecord) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || @@ -8767,11 +12517,16 @@ cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventRecordWithFlags) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || @@ -8780,44 +12535,56 @@ cudaError_t 
cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, uns rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventQuery(cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventQuery) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventSynchronize(cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventSynchronize) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventDestroy(cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventDestroy) < 0 || rpc_write(0, &event, sizeof(cudaEvent_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) { + maybe_copy_unified_arg(0, (void*)ms, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&start, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&end, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaEventElapsedTime) < 0 || rpc_write(0, &start, sizeof(cudaEvent_t)) < 0 || @@ -8826,11 +12593,17 @@ cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t end) rpc_read(0, ms, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ms, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&start, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&end, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaExternalMemoryGetMappedBuffer(void** devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc* bufferDesc) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaExternalMemoryGetMappedBuffer) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -8840,11 +12613,17 @@ cudaError_t cudaExternalMemoryGetMappedBuffer(void** devPtr, cudaExternalMemory_ rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)bufferDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t* mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc* mipmapDesc) { + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaExternalMemoryGetMappedMipmappedArray) < 0 || rpc_write(0, mipmap, sizeof(cudaMipmappedArray_t)) < 0 || @@ -8854,22 +12633,29 @@ cudaError_t cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t* mipm rpc_read(0, mipmap, sizeof(cudaMipmappedArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mipmapDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroyExternalMemory(cudaExternalMemory_t extMem) { + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroyExternalMemory) < 0 || rpc_write(0, &extMem, sizeof(cudaExternalMemory_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&extMem, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, const struct cudaExternalSemaphoreHandleDesc* semHandleDesc) { + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaImportExternalSemaphore) < 0 || rpc_write(0, extSem_out, sizeof(cudaExternalSemaphore_t)) < 0 || @@ -8878,11 +12664,17 @@ cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t* extSem_out, con rpc_read(0, extSem_out, sizeof(cudaExternalSemaphore_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)extSem_out, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)semHandleDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const struct cudaExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSignalExternalSemaphoresAsync_v2) < 0 || rpc_write(0, &extSemArray, sizeof(const cudaExternalSemaphore_t*)) < 0 || @@ -8892,11 +12684,19 @@ cudaError_t cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* extSemArray, const struct cudaExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaWaitExternalSemaphoresAsync_v2) < 0 || rpc_write(0, &extSemArray, sizeof(const cudaExternalSemaphore_t*)) < 0 || @@ -8906,22 +12706,31 @@ cudaError_t cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t* ex rpc_wait_for_response(0) < 0 || rpc_end_response(0, 
&return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)extSemArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)paramsArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numExtSems, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) { + maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroyExternalSemaphore) < 0 || rpc_write(0, &extSem, sizeof(cudaExternalSemaphore_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&extSem, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t* config, const void* func, void** args) { + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaLaunchKernelExC) < 0 || rpc_write(0, &config, sizeof(const cudaLaunchConfig_t*)) < 0 || @@ -8931,11 +12740,20 @@ cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t* config, const void* fu rpc_read(0, args, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)config, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaLaunchCooperativeKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&gridDim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockDim, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&sharedMem, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaLaunchCooperativeKernel) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -8948,11 +12766,20 @@ cudaError_t cudaLaunchCooperativeKernel(const void* func, dim3 gridDim, dim3 blo rpc_read(0, args, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&gridDim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockDim, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)args, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&sharedMem, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaLaunchCooperativeKernelMultiDevice) < 0 || rpc_write(0, launchParamsList, sizeof(struct cudaLaunchParams)) < 0 || @@ -8962,11 +12789,16 @@ cudaError_t cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams* laun rpc_read(0, launchParamsList, sizeof(struct cudaLaunchParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)launchParamsList, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&numDevices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncSetCacheConfig(const void* func, enum cudaFuncCache cacheConfig) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncSetCacheConfig) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -8974,11 +12806,15 @@ cudaError_t cudaFuncSetCacheConfig(const void* func, enum cudaFuncCache cacheCon rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cacheConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncSetSharedMemConfig(const void* func, enum cudaSharedMemConfig config) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncSetSharedMemConfig) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -8986,11 +12822,15 @@ cudaError_t cudaFuncSetSharedMemConfig(const void* func, enum cudaSharedMemConfi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&config, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* func) { + maybe_copy_unified_arg(0, (void*)attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncGetAttributes) < 0 || rpc_write(0, attr, sizeof(struct cudaFuncAttributes)) < 0 
|| @@ -8999,11 +12839,16 @@ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* f rpc_read(0, attr, sizeof(struct cudaFuncAttributes)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFuncSetAttribute(const void* func, enum cudaFuncAttribute attr, int value) { + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFuncSetAttribute) < 0 || rpc_write(0, &func, sizeof(const void*)) < 0 || @@ -9012,11 +12857,15 @@ cudaError_t cudaFuncSetAttribute(const void* func, enum cudaFuncAttribute attr, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDoubleForDevice(double* d) { + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDoubleForDevice) < 0 || rpc_write(0, d, sizeof(double)) < 0 || @@ -9024,11 +12873,13 @@ cudaError_t cudaSetDoubleForDevice(double* d) rpc_read(0, d, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaSetDoubleForHost(double* d) { + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaSetDoubleForHost) < 0 || rpc_write(0, d, sizeof(double)) < 0 
|| @@ -9036,11 +12887,16 @@ cudaError_t cudaSetDoubleForHost(double* d) rpc_read(0, d, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)d, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const void* func, int blockSize, size_t dynamicSMemSize) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxActiveBlocksPerMultiprocessor) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -9051,11 +12907,19 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, const void* func, int numBlocks, int blockSize) { + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyAvailableDynamicSMemPerBlock) < 0 || rpc_write(0, dynamicSmemSize, sizeof(size_t)) < 0 || @@ -9066,11 +12930,20 @@ cudaError_t 
cudaOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, c rpc_read(0, dynamicSmemSize, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)dynamicSmemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, const void* func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) < 0 || rpc_write(0, numBlocks, sizeof(int)) < 0 || @@ -9082,11 +12955,19 @@ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlock rpc_read(0, numBlocks, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)numBlocks, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&blockSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dynamicSMemSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxPotentialClusterSize(int* clusterSize, const void* func, const cudaLaunchConfig_t* launchConfig) { + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxPotentialClusterSize) < 0 || rpc_write(0, clusterSize, sizeof(int)) < 0 || @@ -9096,11 +12977,17 @@ cudaError_t cudaOccupancyMaxPotentialClusterSize(int* clusterSize, const void* f rpc_read(0, clusterSize, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)clusterSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, const cudaLaunchConfig_t* launchConfig) { + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaOccupancyMaxActiveClusters) < 0 || rpc_write(0, numClusters, sizeof(int)) < 0 || @@ -9110,11 +12997,16 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c rpc_read(0, numClusters, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)numClusters, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)func, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)launchConfig, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMalloc(void** devPtr, size_t size) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMalloc) < 0 || rpc_write(0, &size, sizeof(size_t)) < 0 || @@ 
-9122,11 +13014,15 @@ cudaError_t cudaMalloc(void** devPtr, size_t size) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocHost(void** ptr, size_t size) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocHost) < 0 || rpc_write(0, ptr, sizeof(void*)) < 0 || @@ -9135,11 +13031,17 @@ cudaError_t cudaMallocHost(void** ptr, size_t size) rpc_read(0, ptr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocPitch) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9151,11 +13053,20 @@ cudaError_t cudaMallocPitch(void** devPtr, size_t* pitch, size_t width, size_t h rpc_read(0, pitch, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); return return_value; 
} cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDesc* desc, size_t width, size_t height, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocArray) < 0 || rpc_write(0, array, sizeof(cudaArray_t)) < 0 || @@ -9167,44 +13078,58 @@ cudaError_t cudaMallocArray(cudaArray_t* array, const struct cudaChannelFormatDe rpc_read(0, array, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFreeHost(void* ptr) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFreeHost) < 0 || rpc_write(0, &ptr, sizeof(void*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFreeArray(cudaArray_t array) { + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFreeArray) < 0 || rpc_write(0, &array, sizeof(cudaArray_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, 
(void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) { + maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaFreeMipmappedArray) < 0 || rpc_write(0, &mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pHost, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaHostAlloc) < 0 || rpc_write(0, pHost, sizeof(void*)) < 0 || @@ -9214,11 +13139,16 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) rpc_read(0, pHost, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pHost, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) { + maybe_copy_unified_arg(0, (void*)pitchedDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMalloc3D) < 0 || rpc_write(0, pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 || @@ -9227,11 +13157,17 @@ cudaError_t cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent rpc_read(0, pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 || rpc_end_response(0, &return_value) < 0) 
return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pitchedDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMalloc3DArray(cudaArray_t* array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMalloc3DArray) < 0 || rpc_write(0, array, sizeof(cudaArray_t)) < 0 || @@ -9242,11 +13178,20 @@ cudaError_t cudaMalloc3DArray(cudaArray_t* array, const struct cudaChannelFormat rpc_read(0, array, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocMipmappedArray(cudaMipmappedArray_t* mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numLevels, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocMipmappedArray) < 0 || rpc_write(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || @@ -9258,11 +13203,19 @@ cudaError_t 
cudaMallocMipmappedArray(cudaMipmappedArray_t* mipmappedArray, const rpc_read(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numLevels, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetMipmappedArrayLevel(cudaArray_t* levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) { + maybe_copy_unified_arg(0, (void*)levelArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetMipmappedArrayLevel) < 0 || rpc_write(0, levelArray, sizeof(cudaArray_t)) < 0 || @@ -9272,33 +13225,42 @@ cudaError_t cudaGetMipmappedArrayLevel(cudaArray_t* levelArray, cudaMipmappedArr rpc_read(0, levelArray, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)levelArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&level, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy3D(const struct cudaMemcpy3DParms* p) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3D) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DParms*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return 
return_value; } cudaError_t cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms* p) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3DPeer) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DPeerParms*)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy3DAsync(const struct cudaMemcpy3DParms* p, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3DAsync) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DParms*)) < 0 || @@ -9306,11 +13268,15 @@ cudaError_t cudaMemcpy3DAsync(const struct cudaMemcpy3DParms* p, cudaStream_t st rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms* p, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy3DPeerAsync) < 0 || rpc_write(0, &p, sizeof(const struct cudaMemcpy3DPeerParms*)) < 0 || @@ -9318,11 +13284,15 @@ cudaError_t cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms* p, cudaStr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)p, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaMemGetInfo(size_t* free, size_t* total) { + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemGetInfo) < 0 || rpc_write(0, free, sizeof(size_t)) < 0 || @@ -9332,11 +13302,17 @@ cudaError_t cudaMemGetInfo(size_t* free, size_t* total) rpc_read(0, total, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)free, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)total, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetInfo(struct cudaChannelFormatDesc* desc, struct cudaExtent* extent, unsigned int* flags, cudaArray_t array) { + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetInfo) < 0 || rpc_write(0, desc, sizeof(struct cudaChannelFormatDesc)) < 0 || @@ -9349,11 +13325,18 @@ cudaError_t cudaArrayGetInfo(struct cudaChannelFormatDesc* desc, struct cudaExte rpc_read(0, flags, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetPlane(cudaArray_t* pPlaneArray, cudaArray_t hArray, unsigned int planeIdx) { + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&planeIdx, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetPlane) < 0 || rpc_write(0, pPlaneArray, sizeof(cudaArray_t)) < 0 || @@ -9363,11 +13346,17 @@ cudaError_t cudaArrayGetPlane(cudaArray_t* pPlaneArray, cudaArray_t hArray, unsi rpc_read(0, pPlaneArray, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pPlaneArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&planeIdx, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements* memoryRequirements, cudaArray_t array, int device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || @@ -9377,11 +13366,17 @@ cudaError_t cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements* m rpc_read(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements* memoryRequirements, cudaMipmappedArray_t mipmap, int device) { + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMipmappedArrayGetMemoryRequirements) < 0 || rpc_write(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || @@ -9391,11 +13386,16 @@ cudaError_t cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequir rpc_read(0, memoryRequirements, sizeof(struct cudaArrayMemoryRequirements)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memoryRequirements, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaArrayGetSparseProperties(struct cudaArraySparseProperties* sparseProperties, cudaArray_t array) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaArrayGetSparseProperties) < 0 || rpc_write(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || @@ -9404,11 +13404,15 @@ cudaError_t cudaArrayGetSparseProperties(struct cudaArraySparseProperties* spars rpc_read(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties* sparseProperties, cudaMipmappedArray_t mipmap) { + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMipmappedArrayGetSparseProperties) < 0 
|| rpc_write(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || @@ -9417,11 +13421,21 @@ cudaError_t cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperti rpc_read(0, sparseProperties, sizeof(struct cudaArraySparseProperties)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)sparseProperties, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipmap, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy2DToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9435,11 +13449,28 @@ cudaError_t cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpy2DArrayToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9454,11 +13485,25 @@ cudaError_t cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } 
cudaError_t cudaMemcpyToSymbol(const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToSymbol) < 0 || rpc_write(0, &symbol, sizeof(const void*)) < 0 || @@ -9469,11 +13514,25 @@ cudaError_t cudaMemcpyToSymbol(const void* symbol, const void* src, size_t count rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t 
return_value; if (rpc_start_request(0, RPC_cudaMemcpy2DToArrayAsync) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9488,11 +13547,26 @@ cudaError_t cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOf rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&spitch, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&width, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&height, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToSymbolAsync(const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToSymbolAsync) < 0 || rpc_write(0, &symbol, sizeof(const void*)) < 0 || @@ -9504,11 +13578,20 @@ cudaError_t cudaMemcpyToSymbolAsync(const void* symbol, const void* src, size_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemset3D) < 0 || rpc_write(0, &pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 || @@ -9517,11 +13600,18 @@ cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemset3DAsync) < 0 || rpc_write(0, &pitchedDevPtr, sizeof(struct cudaPitchedPtr)) < 0 || @@ -9531,11 +13621,17 @@ cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, st rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
maybe_copy_unified_arg(0, (void*)&pitchedDevPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&value, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&extent, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetSymbolAddress(void** devPtr, const void* symbol) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetSymbolAddress) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9544,11 +13640,15 @@ cudaError_t cudaGetSymbolAddress(void** devPtr, const void* symbol) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetSymbolSize(size_t* size, const void* symbol) { + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetSymbolSize) < 0 || rpc_write(0, size, sizeof(size_t)) < 0 || @@ -9557,11 +13657,17 @@ cudaError_t cudaGetSymbolSize(size_t* size, const void* symbol) rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dstDevice, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPrefetchAsync) < 0 || rpc_write(0, &devPtr, sizeof(const void*)) < 0 || @@ -9571,11 +13677,19 @@ cudaError_t cudaMemPrefetchAsync(const void* devPtr, size_t count, int dstDevice rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dstDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemAdvise(const void* devPtr, size_t count, enum cudaMemoryAdvise advice, int device) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemAdvise) < 0 || rpc_write(0, &devPtr, sizeof(const void*)) < 0 || @@ -9585,11 +13699,21 @@ cudaError_t cudaMemAdvise(const void* devPtr, size_t count, enum cudaMemoryAdvis rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&advice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemRangeGetAttributes(void** data, size_t* dataSizes, enum cudaMemRangeAttribute* attributes, size_t numAttributes, const void* devPtr, size_t count) { + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemRangeGetAttributes) < 0 || rpc_write(0, data, sizeof(void*)) < 0 || @@ -9604,11 +13728,23 @@ cudaError_t cudaMemRangeGetAttributes(void** data, size_t* dataSizes, enum cudaM rpc_read(0, attributes, sizeof(enum cudaMemRangeAttribute)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)data, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataSizes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numAttributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9620,11 +13756,25 @@ cudaError_t cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, c rpc_wait_for_response(0) < 0 || 
rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffsetSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyArrayToArray) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9638,11 +13788,26 @@ cudaError_t cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hO rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffsetDst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&hOffsetSrc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemcpyToArrayAsync) < 0 || rpc_write(0, &dst, sizeof(cudaArray_t)) < 0 || @@ -9655,11 +13820,21 @@ cudaError_t cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffs rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&dst, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hOffset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocAsync(void** devPtr, size_t size, cudaStream_t hStream) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyHostToDevice); cudaError_t return_value; 
if (rpc_start_request(0, RPC_cudaMallocAsync) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9669,11 +13844,16 @@ cudaError_t cudaMallocAsync(void** devPtr, size_t size, cudaStream_t hStream) rpc_read(0, devPtr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep) { + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolTrimTo) < 0 || rpc_write(0, &memPool, sizeof(cudaMemPool_t)) < 0 || @@ -9681,11 +13861,16 @@ cudaError_t cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&minBytesToKeep, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc* descList, size_t count) { + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)descList, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolSetAccess) < 0 || rpc_write(0, &memPool, sizeof(cudaMemPool_t)) < 0 || @@ -9694,11 +13879,17 @@ cudaError_t cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAcce rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, 
(void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)descList, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolGetAccess(enum cudaMemAccessFlags* flags, cudaMemPool_t memPool, struct cudaMemLocation* location) { + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolGetAccess) < 0 || rpc_write(0, flags, sizeof(enum cudaMemAccessFlags)) < 0 || @@ -9709,11 +13900,16 @@ cudaError_t cudaMemPoolGetAccess(enum cudaMemAccessFlags* flags, cudaMemPool_t m rpc_read(0, location, sizeof(struct cudaMemLocation)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)location, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolCreate(cudaMemPool_t* memPool, const struct cudaMemPoolProps* poolProps) { + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolCreate) < 0 || rpc_write(0, memPool, sizeof(cudaMemPool_t)) < 0 || @@ -9722,22 +13918,30 @@ cudaError_t cudaMemPoolCreate(cudaMemPool_t* memPool, const struct cudaMemPoolPr rpc_read(0, memPool, sizeof(cudaMemPool_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)poolProps, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolDestroy(cudaMemPool_t memPool) { + 
maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolDestroy) < 0 || rpc_write(0, &memPool, sizeof(cudaMemPool_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMallocFromPoolAsync) < 0 || rpc_write(0, ptr, sizeof(void*)) < 0 || @@ -9748,11 +13952,18 @@ cudaError_t cudaMallocFromPoolAsync(void** ptr, size_t size, cudaMemPool_t memPo rpc_read(0, ptr, sizeof(void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaMemPoolImportPointer(void** ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData* exportData) { + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)exportData, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaMemPoolImportPointer) < 0 || rpc_write(0, ptr, sizeof(void*)) < 0 || @@ -9763,11 +13974,16 @@ cudaError_t cudaMemPoolImportPointer(void** ptr, cudaMemPool_t memPool, struct c rpc_read(0, 
exportData, sizeof(struct cudaMemPoolPtrExportData)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&memPool, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)exportData, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaPointerGetAttributes(struct cudaPointerAttributes* attributes, const void* ptr) { + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaPointerGetAttributes) < 0 || rpc_write(0, attributes, sizeof(struct cudaPointerAttributes)) < 0 || @@ -9776,11 +13992,16 @@ cudaError_t cudaPointerGetAttributes(struct cudaPointerAttributes* attributes, c rpc_read(0, attributes, sizeof(struct cudaPointerAttributes)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)attributes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)ptr, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) { + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceCanAccessPeer) < 0 || rpc_write(0, canAccessPeer, sizeof(int)) < 0 || @@ -9790,11 +14011,16 @@ cudaError_t cudaDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevi rpc_read(0, canAccessPeer, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)canAccessPeer, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceEnablePeerAccess) < 0 || rpc_write(0, &peerDevice, sizeof(int)) < 0 || @@ -9802,33 +14028,41 @@ cudaError_t cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceDisablePeerAccess(int peerDevice) { + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceDisablePeerAccess) < 0 || rpc_write(0, &peerDevice, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&peerDevice, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) { + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsUnregisterResource) < 0 || rpc_write(0, &resource, sizeof(cudaGraphicsResource_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsResourceSetMapFlags) < 0 || rpc_write(0, &resource, sizeof(cudaGraphicsResource_t)) < 0 || @@ -9836,11 +14070,16 @@ cudaError_t cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, uns rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsMapResources(int count, cudaGraphicsResource_t* resources, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsMapResources) < 0 || rpc_write(0, &count, sizeof(int)) < 0 || @@ -9850,11 +14089,17 @@ cudaError_t cudaGraphicsMapResources(int count, cudaGraphicsResource_t* resource rpc_read(0, resources, sizeof(cudaGraphicsResource_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t* resources, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsUnmapResources) < 0 || rpc_write(0, &count, sizeof(int)) < 0 || @@ -9864,11 +14109,17 @@ cudaError_t 
cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t* resour rpc_read(0, resources, sizeof(cudaGraphicsResource_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resources, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsResourceGetMappedPointer(void** devPtr, size_t* size, cudaGraphicsResource_t resource) { + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsResourceGetMappedPointer) < 0 || rpc_write(0, devPtr, sizeof(void*)) < 0 || @@ -9879,11 +14130,18 @@ cudaError_t cudaGraphicsResourceGetMappedPointer(void** devPtr, size_t* size, cu rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)devPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsSubResourceGetMappedArray(cudaArray_t* array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) { + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsSubResourceGetMappedArray) < 0 || rpc_write(0, array, sizeof(cudaArray_t)) < 0 || @@ -9894,11 +14152,17 @@ cudaError_t 
cudaGraphicsSubResourceGetMappedArray(cudaArray_t* array, cudaGraphi rpc_read(0, array, sizeof(cudaArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)array, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&arrayIndex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mipLevel, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t* mipmappedArray, cudaGraphicsResource_t resource) { + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphicsResourceGetMappedMipmappedArray) < 0 || rpc_write(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || @@ -9907,11 +14171,15 @@ cudaError_t cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t* mi rpc_read(0, mipmappedArray, sizeof(cudaMipmappedArray_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)mipmappedArray, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&resource, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetChannelDesc(struct cudaChannelFormatDesc* desc, cudaArray_const_t array) { + maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetChannelDesc) < 0 || rpc_write(0, desc, sizeof(struct cudaChannelFormatDesc)) < 0 || @@ -9920,11 +14188,17 @@ cudaError_t cudaGetChannelDesc(struct cudaChannelFormatDesc* desc, cudaArray_con rpc_read(0, desc, sizeof(struct cudaChannelFormatDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
maybe_copy_unified_arg(0, (void*)desc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&array, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject, const struct cudaResourceDesc* pResDesc, const struct cudaTextureDesc* pTexDesc, const struct cudaResourceViewDesc* pResViewDesc) { + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaCreateTextureObject) < 0 || rpc_write(0, pTexObject, sizeof(cudaTextureObject_t)) < 0 || @@ -9935,22 +14209,30 @@ cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject, const struc rpc_read(0, pTexObject, sizeof(cudaTextureObject_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pTexObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroyTextureObject(cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroyTextureObject) < 0 || rpc_write(0, &texObject, sizeof(cudaTextureObject_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetTextureObjectResourceDesc(struct cudaResourceDesc* pResDesc, cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetTextureObjectResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || @@ -9959,11 +14241,15 @@ cudaError_t cudaGetTextureObjectResourceDesc(struct cudaResourceDesc* pResDesc, rpc_read(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetTextureObjectTextureDesc(struct cudaTextureDesc* pTexDesc, cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetTextureObjectTextureDesc) < 0 || rpc_write(0, pTexDesc, sizeof(struct cudaTextureDesc)) < 0 || @@ -9972,11 +14258,15 @@ cudaError_t cudaGetTextureObjectTextureDesc(struct cudaTextureDesc* pTexDesc, cu rpc_read(0, pTexDesc, sizeof(struct cudaTextureDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pTexDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc* pResViewDesc, cudaTextureObject_t texObject) { + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetTextureObjectResourceViewDesc) < 0 || rpc_write(0, pResViewDesc, sizeof(struct cudaResourceViewDesc)) < 0 || @@ -9985,11 +14275,15 @@ cudaError_t 
cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc* pR rpc_read(0, pResViewDesc, sizeof(struct cudaResourceViewDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pResViewDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&texObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, const struct cudaResourceDesc* pResDesc) { + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaCreateSurfaceObject) < 0 || rpc_write(0, pSurfObject, sizeof(cudaSurfaceObject_t)) < 0 || @@ -9998,22 +14292,28 @@ cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, const stru rpc_read(0, pSurfObject, sizeof(cudaSurfaceObject_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pSurfObject, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDestroySurfaceObject) < 0 || rpc_write(0, &surfObject, sizeof(cudaSurfaceObject_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc* pResDesc, cudaSurfaceObject_t surfObject) { + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaGetSurfaceObjectResourceDesc) < 0 || rpc_write(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || @@ -10022,11 +14322,14 @@ cudaError_t cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc* pResDesc, rpc_read(0, pResDesc, sizeof(struct cudaResourceDesc)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pResDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&surfObject, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDriverGetVersion(int* driverVersion) { + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDriverGetVersion) < 0 || rpc_write(0, driverVersion, sizeof(int)) < 0 || @@ -10034,11 +14337,13 @@ cudaError_t cudaDriverGetVersion(int* driverVersion) rpc_read(0, driverVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)driverVersion, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) { + maybe_copy_unified_arg(0, (void*)runtimeVersion, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaRuntimeGetVersion) < 0 || rpc_write(0, runtimeVersion, sizeof(int)) < 0 || @@ -10046,11 +14351,14 @@ cudaError_t cudaRuntimeGetVersion(int* runtimeVersion) rpc_read(0, runtimeVersion, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)runtimeVersion, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphCreate) < 0 || rpc_write(0, pGraph, 
sizeof(cudaGraph_t)) < 0 || @@ -10059,11 +14367,18 @@ cudaError_t cudaGraphCreate(cudaGraph_t* pGraph, unsigned int flags) rpc_read(0, pGraph, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddKernelNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddKernelNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10075,11 +14390,18 @@ cudaError_t cudaGraphAddKernelNode(cudaGraphNode_t* pGraphNode, cudaGraph_t grap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t 
return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10088,11 +14410,15 @@ cudaError_t cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernel rpc_read(0, pNodeParams, sizeof(struct cudaKernelNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10100,11 +14426,15 @@ cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cuda rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) { + maybe_copy_unified_arg(0, (void*)&hSrc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hDst, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeCopyAttributes) < 0 || rpc_write(0, &hSrc, sizeof(cudaGraphNode_t)) < 0 || @@ -10112,11 +14442,16 @@ cudaError_t cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNod rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hSrc, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&hDst, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeGetAttribute(cudaGraphNode_t hNode, cudaLaunchAttributeID attr, cudaLaunchAttributeValue* value_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeGetAttribute) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10126,11 +14461,17 @@ cudaError_t cudaGraphKernelNodeGetAttribute(cudaGraphNode_t hNode, cudaLaunchAtt rpc_read(0, value_out, sizeof(cudaLaunchAttributeValue)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphKernelNodeSetAttribute(cudaGraphNode_t hNode, cudaLaunchAttributeID attr, const cudaLaunchAttributeValue* value) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphKernelNodeSetAttribute) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10139,11 +14480,19 @@ cudaError_t cudaGraphKernelNodeSetAttribute(cudaGraphNode_t hNode, cudaLaunchAtt rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGraphAddMemcpyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms* pCopyParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pCopyParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemcpyNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10155,11 +14504,25 @@ cudaError_t cudaGraphAddMemcpyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t grap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pCopyParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemcpyNodeToSymbol(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemcpyNodeToSymbol) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10175,11 +14538,22 @@ cudaError_t cudaGraphAddMemcpyNodeToSymbol(cudaGraphNode_t* pGraphNode, cudaGrap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemcpyNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10188,11 +14562,15 @@ cudaError_t cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy rpc_read(0, pNodeParams, sizeof(struct cudaMemcpy3DParms)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, 
cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemcpyNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10200,11 +14578,19 @@ cudaError_t cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cuda rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(cudaGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemcpyNodeSetParamsToSymbol) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10216,11 +14602,22 @@ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(cudaGraphNode_t node, const voi rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemsetNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaMemsetParams* pMemsetParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pMemsetParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemsetNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10232,11 +14629,18 @@ cudaError_t cudaGraphAddMemsetNode(cudaGraphNode_t* pGraphNode, cudaGraph_t grap rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pMemsetParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemsetNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10245,11 +14649,15 @@ cudaError_t cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, 
struct cudaMemset rpc_read(0, pNodeParams, sizeof(struct cudaMemsetParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemsetNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10257,11 +14665,18 @@ cudaError_t cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cuda rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddHostNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddHostNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10273,11 +14688,18 @@ cudaError_t cudaGraphAddHostNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) 
return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphHostNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10286,11 +14708,15 @@ cudaError_t cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNode rpc_read(0, pNodeParams, sizeof(struct cudaHostNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphHostNodeSetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10298,11 +14724,18 @@ cudaError_t cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHo rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } 
cudaError_t cudaGraphAddChildGraphNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaGraph_t childGraph) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddChildGraphNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10314,11 +14747,18 @@ cudaError_t cudaGraphAddChildGraphNode(cudaGraphNode_t* pGraphNode, cudaGraph_t rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t* pGraph) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphChildGraphNodeGetGraph) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10327,11 +14767,17 @@ cudaError_t cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t* p rpc_read(0, pGraph, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)pGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddEmptyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddEmptyNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10342,11 +14788,20 @@ cudaError_t cudaGraphAddEmptyNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddEventRecordNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddEventRecordNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10358,11 +14813,18 @@ cudaError_t cudaGraphAddEventRecordNode(cudaGraphNode_t* pGraphNode, 
cudaGraph_t rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* event_out) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventRecordNodeGetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10371,11 +14833,15 @@ cudaError_t cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* rpc_read(0, event_out, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventRecordNodeSetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10383,11 +14849,18 @@ cudaError_t cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t e rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, 
cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddEventWaitNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddEventWaitNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10399,11 +14872,18 @@ cudaError_t cudaGraphAddEventWaitNode(cudaGraphNode_t* pGraphNode, cudaGraph_t g rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* event_out) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventWaitNodeGetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10412,11 +14892,15 @@ cudaError_t cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t* ev rpc_read(0, event_out, sizeof(cudaEvent_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)event_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphEventWaitNodeSetEvent) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10424,11 +14908,18 @@ cudaError_t cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t eve rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddExternalSemaphoresSignalNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10440,11 +14931,18 @@ cudaError_t cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t* pGraphNode rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresSignalNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10453,11 +14951,15 @@ cudaError_t cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode rpc_read(0, params_out, sizeof(struct cudaExternalSemaphoreSignalNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresSignalNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10465,11 +14967,18 @@ cudaError_t cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddExternalSemaphoresWaitNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10481,11 +14990,18 @@ cudaError_t cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t* pGraphNode, rpc_read(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams* params_out) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresWaitNodeGetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10494,11 +15010,15 @@ cudaError_t cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, rpc_read(0, params_out, sizeof(struct cudaExternalSemaphoreWaitNodeParams)) < 0 || 
rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10506,11 +15026,18 @@ cudaError_t cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddMemAllocNode(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddMemAllocNode) < 0 || rpc_write(0, pGraphNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10523,11 +15050,18 @@ cudaError_t cudaGraphAddMemAllocNode(cudaGraphNode_t* pGraphNode, cudaGraph_t gr rpc_read(0, nodeParams, sizeof(struct cudaMemAllocNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return 
cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams* params_out) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphMemAllocNodeGetParams) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10536,22 +15070,28 @@ cudaError_t cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemA rpc_read(0, params_out, sizeof(struct cudaMemAllocNodeParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)params_out, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaDeviceGraphMemTrim(int device) { + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaDeviceGraphMemTrim) < 0 || rpc_write(0, &device, sizeof(int)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&device, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphClone(cudaGraph_t* pGraphClone, cudaGraph_t originalGraph) { + maybe_copy_unified_arg(0, (void*)pGraphClone, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphClone) < 0 || rpc_write(0, 
pGraphClone, sizeof(cudaGraph_t)) < 0 || @@ -10560,11 +15100,16 @@ cudaError_t cudaGraphClone(cudaGraph_t* pGraphClone, cudaGraph_t originalGraph) rpc_read(0, pGraphClone, sizeof(cudaGraph_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphClone, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&originalGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeFindInClone(cudaGraphNode_t* pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph) { + maybe_copy_unified_arg(0, (void*)pNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&originalNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&clonedGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeFindInClone) < 0 || rpc_write(0, pNode, sizeof(cudaGraphNode_t)) < 0 || @@ -10574,11 +15119,16 @@ cudaError_t cudaGraphNodeFindInClone(cudaGraphNode_t* pNode, cudaGraphNode_t ori rpc_read(0, pNode, sizeof(cudaGraphNode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&originalNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&clonedGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType* pType) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pType, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetType) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10587,11 +15137,16 @@ cudaError_t cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType* p rpc_read(0, pType, sizeof(enum cudaGraphNodeType)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + 
maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pType, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t* nodes, size_t* numNodes) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphGetNodes) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10602,11 +15157,17 @@ cudaError_t cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t* nodes, size_t* rpc_read(0, numNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numNodes, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRootNodes, size_t* pNumRootNodes) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pRootNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumRootNodes, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphGetRootNodes) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10617,11 +15178,18 @@ cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t* pRootNodes rpc_read(0, pNumRootNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pRootNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumRootNodes, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGraphNode_t* to, size_t* numEdges) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphGetEdges) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10634,11 +15202,18 @@ cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGrap rpc_read(0, numEdges, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)numEdges, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* pDependencies, size_t* pNumDependencies) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetDependencies) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10649,11 +15224,17 @@ cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t* rpc_read(0, pNumDependencies, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependencies, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t 
cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t* pDependentNodes, size_t* pNumDependentNodes) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pDependentNodes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNumDependentNodes, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetDependentNodes) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || @@ -10664,11 +15245,18 @@ cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t rpc_read(0, pNumDependentNodes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pDependentNodes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNumDependentNodes, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphAddDependencies) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10678,11 +15266,19 @@ cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t* f rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, 
cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, size_t numDependencies) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphRemoveDependencies) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10692,22 +15288,31 @@ cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)from, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)to, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&numDependencies, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphDestroyNode(cudaGraphNode_t node) { + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphDestroyNode) < 0 || rpc_write(0, &node, sizeof(cudaGraphNode_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphInstantiate(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphInstantiate) < 0 || rpc_write(0, 
pGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10717,11 +15322,17 @@ cudaError_t cudaGraphInstantiate(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, rpc_read(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphInstantiateWithFlags(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, unsigned long long flags) { + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphInstantiateWithFlags) < 0 || rpc_write(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10731,11 +15342,17 @@ cudaError_t cudaGraphInstantiateWithFlags(cudaGraphExec_t* pGraphExec, cudaGraph rpc_read(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphInstantiateWithParams(cudaGraphExec_t* pGraphExec, cudaGraph_t graph, cudaGraphInstantiateParams* instantiateParams) { + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphInstantiateWithParams) < 0 || rpc_write(0, pGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10746,11 +15363,16 @@ 
cudaError_t cudaGraphInstantiateWithParams(cudaGraphExec_t* pGraphExec, cudaGrap rpc_read(0, instantiateParams, sizeof(cudaGraphInstantiateParams)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)pGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)instantiateParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long* flags) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecGetFlags) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10759,11 +15381,16 @@ cudaError_t cudaGraphExecGetFlags(cudaGraphExec_t graphExec, unsigned long long* rpc_read(0, flags, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecKernelNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10772,11 +15399,17 @@ cudaError_t cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, 
(void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecMemcpyNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10785,11 +15418,21 @@ cudaError_t cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecMemcpyNodeSetParamsToSymbol) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 
0 || @@ -10802,11 +15445,21 @@ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(cudaGraphExec_t hGraphExec, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)src, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&offset, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kind, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecMemsetNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10815,11 +15468,17 @@ cudaError_t cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams* pNodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)pNodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecHostNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10828,11 +15487,17 @@ cudaError_t cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraph rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pNodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecChildGraphNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10841,11 +15506,17 @@ cudaError_t cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&node, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&childGraph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, 
RPC_cudaGraphExecEventRecordNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10854,11 +15525,17 @@ cudaError_t cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecEventWaitNodeSetEvent) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10867,11 +15544,17 @@ cudaError_t cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaG rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&event, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecExternalSemaphoresSignalNodeSetParams) < 0 || 
rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10880,11 +15563,17 @@ cudaError_t cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t h rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams* nodeParams) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecExternalSemaphoresWaitNodeSetParams) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10893,11 +15582,17 @@ cudaError_t cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nodeParams, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeSetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ 
-10906,11 +15601,17 @@ cudaError_t cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&isEnabled, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int* isEnabled) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphNodeGetEnabled) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10920,11 +15621,17 @@ cudaError_t cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t rpc_read(0, isEnabled, sizeof(unsigned int)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hNode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)isEnabled, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphExecUpdateResultInfo* resultInfo) { + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphExecUpdate) < 0 || rpc_write(0, &hGraphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10934,11 +15641,16 @@ cudaError_t cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, rpc_read(0, resultInfo, 
sizeof(cudaGraphExecUpdateResultInfo)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&hGraphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hGraph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)resultInfo, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphUpload) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10946,11 +15658,15 @@ cudaError_t cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphLaunch) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || @@ -10958,33 +15674,42 @@ cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stream, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphExecDestroy(cudaGraphExec_t graphExec) { + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyHostToDevice); cudaError_t return_value; if 
(rpc_start_request(0, RPC_cudaGraphExecDestroy) < 0 || rpc_write(0, &graphExec, sizeof(cudaGraphExec_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graphExec, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphDestroy(cudaGraph_t graph) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphDestroy) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphDebugDotPrint(cudaGraph_t graph, const char* path, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphDebugDotPrint) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -10993,11 +15718,16 @@ cudaError_t cudaGraphDebugDotPrint(cudaGraph_t graph, const char* path, unsigned rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)path, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaUserObjectRetain(cudaUserObject_t object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaUserObjectRetain) < 0 || rpc_write(0, &object, sizeof(cudaUserObject_t)) < 0 || @@ 
-11005,11 +15735,15 @@ cudaError_t cudaUserObjectRetain(cudaUserObject_t object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaUserObjectRelease(cudaUserObject_t object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaUserObjectRelease) < 0 || rpc_write(0, &object, sizeof(cudaUserObject_t)) < 0 || @@ -11017,11 +15751,17 @@ cudaError_t cudaUserObjectRelease(cudaUserObject_t object, unsigned int count) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count, unsigned int flags) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphRetainUserObject) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -11031,11 +15771,18 @@ cudaError_t cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count) { + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGraphReleaseUserObject) < 0 || rpc_write(0, &graph, sizeof(cudaGraph_t)) < 0 || @@ -11044,11 +15791,18 @@ cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t objec rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&object, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&count, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult* driverStatus) { + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)funcPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)driverStatus, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetDriverEntryPoint) < 0 || rpc_write(0, &symbol, sizeof(const char*)) < 0 || @@ -11060,11 +15814,17 @@ cudaError_t cudaGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned rpc_read(0, driverStatus, sizeof(enum cudaDriverEntryPointQueryResult)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)symbol, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)funcPtr, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&flags, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)driverStatus, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pExportTableId) { + maybe_copy_unified_arg(0, (void*)ppExportTable, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)pExportTableId, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetExportTable) < 0 || rpc_write(0, ppExportTable, sizeof(const void*)) < 0 || @@ -11073,11 +15833,15 @@ cudaError_t cudaGetExportTable(const void** ppExportTable, const cudaUUID_t* pEx rpc_read(0, ppExportTable, sizeof(const void*)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)ppExportTable, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)pExportTableId, cudaMemcpyDeviceToHost); return return_value; } cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr) { + maybe_copy_unified_arg(0, (void*)functionPtr, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)symbolPtr, cudaMemcpyHostToDevice); cudaError_t return_value; if (rpc_start_request(0, RPC_cudaGetFuncBySymbol) < 0 || rpc_write(0, functionPtr, sizeof(cudaFunction_t)) < 0 || @@ -11086,33 +15850,41 @@ cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolP rpc_read(0, functionPtr, sizeof(cudaFunction_t)) < 0 || rpc_end_response(0, &return_value) < 0) return cudaErrorDevicesUnavailable; + maybe_copy_unified_arg(0, (void*)functionPtr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)symbolPtr, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCreate_v2(cublasHandle_t* handle) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCreate_v2) < 0 || rpc_wait_for_response(0) < 0 
|| rpc_read(0, handle, sizeof(cublasHandle_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDestroy_v2(cublasHandle_t handle) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDestroy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetVersion_v2(cublasHandle_t handle, int* version) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetVersion_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11121,11 +15893,15 @@ cublasStatus_t cublasGetVersion_v2(cublasHandle_t handle, int* version) rpc_read(0, version, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)version, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetProperty(libraryPropertyType type, int* value) { + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetProperty) < 0 || rpc_write(0, &type, sizeof(libraryPropertyType)) < 0 || @@ -11134,11 +15910,15 @@ cublasStatus_t cublasGetProperty(libraryPropertyType type, int* value) rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetStream_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11146,11 +15926,15 @@ cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetStream_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11159,11 +15943,15 @@ cublasStatus_t cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId) rpc_read(0, streamId, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetPointerMode_v2) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -11172,11 +15960,15 @@ cublasStatus_t cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_ rpc_read(0, mode, sizeof(cublasPointerMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetPointerMode_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11184,11 +15976,15 @@ cublasStatus_t cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_ rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetAtomicsMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11197,11 +15993,15 @@ cublasStatus_t cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* rpc_read(0, mode, sizeof(cublasAtomicsMode_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetAtomicsMode(cublasHandle_t handle, 
cublasAtomicsMode_t mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetAtomicsMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11209,11 +16009,15 @@ cublasStatus_t cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t m rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetMathMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11222,11 +16026,15 @@ cublasStatus_t cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode) rpc_read(0, mode, sizeof(cublasMath_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetMathMode) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11234,11 +16042,15 @@ cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)smCountTarget, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetSmCountTarget) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11247,11 +16059,15 @@ cublasStatus_t cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget) rpc_read(0, smCountTarget, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)smCountTarget, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&smCountTarget, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetSmCountTarget) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11259,11 +16075,17 @@ cublasStatus_t cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&smCountTarget, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasLoggerConfigure(int logIsOn, int logToStdOut, int logToStdErr, const char* logFileName) { + maybe_copy_unified_arg(0, (void*)&logIsOn, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&logToStdOut, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&logToStdErr, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)logFileName, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasLoggerConfigure) < 0 || rpc_write(0, &logIsOn, sizeof(int)) < 0 || @@ -11273,22 +16095,29 @@ cublasStatus_t cublasLoggerConfigure(int logIsOn, int logToStdOut, int logToStdE rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&logIsOn, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&logToStdOut, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&logToStdErr, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)logFileName, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSetLoggerCallback(cublasLogCallback userCallback) { + maybe_copy_unified_arg(0, (void*)&userCallback, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSetLoggerCallback) < 0 || rpc_write(0, &userCallback, sizeof(cublasLogCallback)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&userCallback, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGetLoggerCallback(cublasLogCallback* userCallback) { + maybe_copy_unified_arg(0, (void*)userCallback, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGetLoggerCallback) < 0 || rpc_write(0, userCallback, sizeof(cublasLogCallback)) < 0 || @@ -11296,11 +16125,17 @@ cublasStatus_t cublasGetLoggerCallback(cublasLogCallback* userCallback) rpc_read(0, userCallback, sizeof(cublasLogCallback)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)userCallback, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* 
result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11312,11 +16147,21 @@ cublasStatus_t cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSnrm2_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11328,11 +16173,21 @@ cublasStatus_t cublasSnrm2_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11344,11 +16199,21 @@ cublasStatus_t cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11360,11 +16225,21 @@ cublasStatus_t 
cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScnrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11376,11 +16251,21 @@ cublasStatus_t cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScnrm2_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScnrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11392,11 +16277,21 @@ cublasStatus_t cublasScnrm2_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDznrm2_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDznrm2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11408,11 +16303,21 @@ cublasStatus_t cublasDznrm2_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDznrm2_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDznrm2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11424,11 +16329,23 @@ cublasStatus_t cublasDznrm2_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdot_v2(cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11442,11 +16359,25 @@ cublasStatus_t 
cublasSdot_v2(cublasHandle_t handle, int n, const float* x, int i rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdot_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, const float* y, int64_t incy, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11460,11 +16391,25 @@ cublasStatus_t cublasSdot_v2_64(cublasHandle_t handle, int64_t n, const float* x rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11478,11 +16423,25 @@ cublasStatus_t cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdot_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, const double* y, int64_t incy, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11496,11 +16455,25 @@ cublasStatus_t cublasDdot_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotu_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotu_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11514,11 +16487,25 @@ cublasStatus_t cublasCdotu_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotu_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotu_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11532,11 +16519,25 @@ cublasStatus_t cublasCdotu_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasCdotc_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11550,11 +16551,25 @@ cublasStatus_t cublasCdotc_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdotc_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdotc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11568,11 +16583,25 @@ cublasStatus_t cublasCdotc_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, result, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotu_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotu_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11586,11 +16615,25 @@ cublasStatus_t cublasZdotu_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotu_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotu_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11604,11 +16647,25 @@ cublasStatus_t cublasZdotu_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotc_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int 
incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11622,11 +16679,25 @@ cublasStatus_t cublasZdotc_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdotc_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, 
cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdotc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11640,11 +16711,23 @@ cublasStatus_t cublasZdotc_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, result, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11656,11 +16739,21 @@ cublasStatus_t cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha, rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); 
return return_value; } cublasStatus_t cublasSscal_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11672,11 +16765,21 @@ cublasStatus_t cublasSscal_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11688,11 +16791,21 @@ cublasStatus_t cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11704,11 +16817,21 @@ cublasStatus_t cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCscal_v2(cublasHandle_t handle, int n, const cuComplex* alpha, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCscal_v2) < 
0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11720,11 +16843,21 @@ cublasStatus_t cublasCscal_v2(cublasHandle_t handle, int n, const cuComplex* alp rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCscal_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* alpha, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11736,11 +16869,21 @@ cublasStatus_t cublasCscal_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsscal_v2(cublasHandle_t handle, int n, const float* alpha, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11752,11 +16895,21 @@ cublasStatus_t cublasCsscal_v2(cublasHandle_t handle, int n, const float* alpha, rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsscal_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11768,11 +16921,21 @@ cublasStatus_t cublasCsscal_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZscal_v2(cublasHandle_t handle, int n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11784,11 +16947,21 @@ cublasStatus_t cublasZscal_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZscal_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11800,11 +16973,21 @@ cublasStatus_t cublasZscal_v2_64(cublasHandle_t 
handle, int64_t n, const cuDoubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdscal_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11816,11 +16999,21 @@ cublasStatus_t cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdscal_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11832,11 +17025,23 @@ cublasStatus_t cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha, const float* x, int incx, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11850,11 +17055,25 @@ cublasStatus_t cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSaxpy_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, const float* x, int64_t incx, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11868,11 +17087,25 @@ cublasStatus_t cublasSaxpy_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha, const double* x, int incx, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11886,11 +17119,25 @@ cublasStatus_t cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDaxpy_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, const double* x, int64_t incx, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11904,11 +17151,25 @@ cublasStatus_t cublasDaxpy_v2_64(cublasHandle_t handle, int64_t 
n, const double* rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCaxpy_v2(cublasHandle_t handle, int n, const cuComplex* alpha, const cuComplex* x, int incx, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11922,11 +17183,25 @@ cublasStatus_t cublasCaxpy_v2(cublasHandle_t handle, int n, const cuComplex* alp rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11940,11 +17215,25 @@ cublasStatus_t cublasCaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZaxpy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZaxpy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11958,11 +17247,25 @@ cublasStatus_t cublasZaxpy_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZaxpy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11976,11 +17279,24 @@ cublasStatus_t cublasZaxpy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -11993,11 +17309,23 @@ cublasStatus_t cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScopy_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12010,11 +17338,23 @@ cublasStatus_t cublasScopy_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12027,11 +17367,23 @@ cublasStatus_t cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12044,11 +17396,23 @@ cublasStatus_t cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12061,11 +17425,23 @@ cublasStatus_t cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCcopy_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12078,11 +17454,23 @@ cublasStatus_t cublasCcopy_v2_64(cublasHandle_t handle, int64_t n, const cuCompl rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZcopy_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12095,11 +17483,23 @@ cublasStatus_t cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComple rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZcopy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZcopy_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12112,11 +17512,23 @@ cublasStatus_t cublasZcopy_v2_64(cublasHandle_t handle, int64_t n, const cuDoubl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12130,11 +17542,23 @@ cublasStatus_t cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSswap_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12148,11 +17572,23 @@ cublasStatus_t cublasSswap_v2_64(cublasHandle_t handle, int64_t n, float* x, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t 
return_value; if (rpc_start_request(0, RPC_cublasDswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12166,11 +17602,23 @@ cublasStatus_t cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDswap_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12184,11 +17632,23 @@ cublasStatus_t cublasDswap_v2_64(cublasHandle_t handle, int64_t n, double* x, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12202,11 +17662,23 @@ cublasStatus_t cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCswap_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12220,11 
+17692,23 @@ cublasStatus_t cublasCswap_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZswap_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12238,11 +17722,23 @@ cublasStatus_t cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZswap_v2_64(cublasHandle_t handle, int64_t n, 
cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZswap_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12256,11 +17752,22 @@ cublasStatus_t cublasZswap_v2_64(cublasHandle_t handle, int64_t n, cuDoubleCompl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12272,11 +17779,21 @@ cublasStatus_t cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 
0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamax_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12288,11 +17805,21 @@ cublasStatus_t cublasIsamax_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); 
cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12304,11 +17831,21 @@ cublasStatus_t cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, in rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12320,11 +17857,21 @@ cublasStatus_t cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12336,11 +17883,21 @@ cublasStatus_t cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamax_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12352,11 +17909,21 @@ cublasStatus_t cublasIcamax_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamax_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamax_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12368,11 +17935,21 @@ cublasStatus_t cublasIzamax_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamax_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamax_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12384,11 +17961,22 @@ 
cublasStatus_t cublasIzamax_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIamaxEx(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIamaxEx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12401,11 +17989,23 @@ cublasStatus_t cublasIamaxEx(cublasHandle_t handle, int n, const void* x, cudaDa rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIamaxEx_64(cublasHandle_t handle, int64_t n, const void* x, cudaDataType xType, int64_t incx, int64_t* result) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIamaxEx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12418,11 +18018,22 @@ cublasStatus_t cublasIamaxEx_64(cublasHandle_t handle, int64_t n, const void* x, rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12434,11 +18045,21 @@ cublasStatus_t cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIsamin_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIsamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12450,11 +18071,21 @@ cublasStatus_t cublasIsamin_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasIdamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12466,11 +18097,21 @@ cublasStatus_t cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, in rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIdamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12482,11 +18123,21 @@ cublasStatus_t cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12498,11 +18149,21 @@ cublasStatus_t cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIcamin_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIcamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12514,11 +18175,21 @@ cublasStatus_t cublasIcamin_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamin_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamin_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12530,11 +18201,21 @@ cublasStatus_t cublasIzamin_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIzamin_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, int64_t* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIzamin_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12546,11 +18227,22 @@ cublasStatus_t 
cublasIzamin_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIaminEx(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIaminEx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12563,11 +18255,23 @@ cublasStatus_t cublasIaminEx(cublasHandle_t handle, int n, const void* x, cudaDa rpc_read(0, result, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasIaminEx_64(cublasHandle_t handle, int64_t n, const void* x, cudaDataType xType, int64_t incx, int64_t* result) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasIaminEx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12580,11 +18284,22 @@ cublasStatus_t cublasIaminEx_64(cublasHandle_t handle, int64_t n, const void* x, rpc_read(0, result, sizeof(int64_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12596,11 +18311,21 @@ cublasStatus_t cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSasum_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12612,11 +18337,21 @@ cublasStatus_t cublasSasum_v2_64(cublasHandle_t handle, int64_t n, const float* rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasDasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12628,11 +18363,21 @@ cublasStatus_t cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12644,11 +18389,21 @@ cublasStatus_t cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double* rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12660,11 +18415,21 @@ cublasStatus_t cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasScasum_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, float* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasScasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12676,11 +18441,21 @@ cublasStatus_t cublasScasum_v2_64(cublasHandle_t handle, int64_t n, const cuComp rpc_read(0, result, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDzasum_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDzasum_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12692,11 +18467,21 @@ cublasStatus_t cublasDzasum_v2(cublasHandle_t handle, int n, const cuDoubleCompl rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDzasum_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, double* result) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDzasum_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12708,11 +18493,24 @@ cublasStatus_t 
cublasDzasum_v2_64(cublasHandle_t handle, int64_t n, const cuDoub rpc_read(0, result, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)result, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrot_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12728,11 +18526,27 @@ cublasStatus_t cublasSrot_v2(cublasHandle_t handle, int n, float* x, int incx, f rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrot_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12748,11 +18562,27 @@ cublasStatus_t cublasSrot_v2_64(cublasHandle_t handle, int64_t n, float* x, int6 rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12768,11 +18598,27 @@ cublasStatus_t cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrot_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrot_v2_64) < 0 || 
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12788,11 +18634,27 @@ cublasStatus_t cublasDrot_v2_64(cublasHandle_t handle, int64_t n, double* x, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCrot_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy, const float* c, const cuComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12808,11 +18670,27 @@ cublasStatus_t cublasCrot_v2(cublasHandle_t handle, int n, cuComplex* x, int inc rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy, const float* c, const cuComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12828,11 +18706,27 @@ cublasStatus_t cublasCrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } 
cublasStatus_t cublasCsrot_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12848,11 +18742,27 @@ cublasStatus_t cublasCsrot_v2(cublasHandle_t handle, int n, cuComplex* x, int in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy, const float* c, const float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12868,11 +18778,27 @@ cublasStatus_t cublasCsrot_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, const double* c, const cuDoubleComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12888,11 +18814,27 
@@ cublasStatus_t cublasZrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy, const double* c, const cuDoubleComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12908,11 +18850,27 @@ cublasStatus_t cublasZrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComple rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdrot_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12928,11 +18886,27 @@ cublasStatus_t cublasZdrot_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasZdrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy, const double* c, const double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdrot_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12948,11 +18922,24 @@ cublasStatus_t cublasZdrot_v2_64(cublasHandle_t handle, int64_t n, cuDoubleCompl rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotg_v2(cublasHandle_t handle, float* a, float* b, float* c, float* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, 
cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12967,11 +18954,21 @@ cublasStatus_t cublasSrotg_v2(cublasHandle_t handle, float* a, float* b, float* rpc_read(0, s, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotg_v2(cublasHandle_t handle, double* a, double* b, double* c, double* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -12986,11 +18983,21 @@ cublasStatus_t cublasDrotg_v2(cublasHandle_t handle, double* a, double* b, doubl rpc_read(0, s, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCrotg_v2(cublasHandle_t handle, cuComplex* a, cuComplex* b, float* c, cuComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13005,11 +19012,21 @@ cublasStatus_t cublasCrotg_v2(cublasHandle_t handle, cuComplex* a, cuComplex* b, rpc_read(0, s, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZrotg_v2(cublasHandle_t handle, cuDoubleComplex* a, cuDoubleComplex* b, double* c, cuDoubleComplex* s) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZrotg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13024,11 +19041,23 @@ cublasStatus_t cublasZrotg_v2(cublasHandle_t handle, cuDoubleComplex* a, cuDoubl rpc_read(0, s, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)a, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)b, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)s, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotm_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, const float* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13043,11 +19072,25 @@ cublasStatus_t cublasSrotm_v2(cublasHandle_t handle, int n, float* x, int incx, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotm_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy, const float* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13062,11 +19105,25 @@ cublasStatus_t cublasSrotm_v2_64(cublasHandle_t handle, int64_t n, float* x, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, const double* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13081,11 +19138,25 @@ cublasStatus_t cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotm_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy, const double* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13100,11 +19171,24 @@ cublasStatus_t cublasDrotm_v2_64(cublasHandle_t handle, int64_t n, double* x, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSrotmg_v2(cublasHandle_t handle, float* d1, float* d2, float* x1, const float* y1, float* param) 
{ + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSrotmg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13120,11 +19204,23 @@ cublasStatus_t cublasSrotmg_v2(cublasHandle_t handle, float* d1, float* d2, floa rpc_read(0, param, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, double* x1, const double* y1, double* param) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDrotmg_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13140,11 +19236,29 @@ cublasStatus_t cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, do rpc_read(0, param, sizeof(double)) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)d2, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y1, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)param, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13163,11 +19277,35 @@ cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13186,11 +19324,35 @@ cublasStatus_t cublasSgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return 
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13209,11 +19371,35 @@ 
cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13232,11 +19418,35 @@ cublasStatus_t cublasDgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13255,11 +19465,35 @@ cublasStatus_t cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13278,11 +19512,35 @@ cublasStatus_t cublasCgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13301,11 +19559,35 @@ cublasStatus_t cublasZgemv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, 
const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13324,11 +19606,37 @@ cublasStatus_t cublasZgemv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13349,11 +19657,39 @@ cublasStatus_t cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13374,11 +19710,39 @@ cublasStatus_t cublasSgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(float)) < 0 || 
rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13399,11 +19763,39 @@ cublasStatus_t cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13424,11 +19816,39 @@ cublasStatus_t cublasDgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgbmv_v2(cublasHandle_t 
handle, cublasOperation_t trans, int m, int n, int kl, int ku, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13449,11 +19869,39 @@ cublasStatus_t cublasCgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13474,11 +19922,39 @@ cublasStatus_t cublasCgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, int ku, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13499,11 +19975,39 @@ cublasStatus_t cublasZgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, in rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, int64_t kl, int64_t ku, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13524,11 +20028,34 @@ cublasStatus_t cublasZgbmv_v2_64(cublasHandle_t handle, cublasOperation_t trans, rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&kl, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ku, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasStrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13544,11 +20071,29 @@ cublasStatus_t cublasStrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13564,11 +20109,29 @@ cublasStatus_t cublasStrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13584,11 +20147,29 @@ cublasStatus_t cublasDtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13604,11 +20185,29 @@ cublasStatus_t cublasDtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13624,11 +20223,29 @@ cublasStatus_t 
cublasCtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13644,11 +20261,29 @@ cublasStatus_t cublasCtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13664,11 +20299,29 @@ cublasStatus_t cublasZtrmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13684,11 +20337,30 @@ cublasStatus_t cublasZtrmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13705,11 +20377,31 @@ cublasStatus_t cublasStbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13726,11 +20418,31 @@ cublasStatus_t cublasStbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasDtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13747,11 +20459,31 @@ cublasStatus_t cublasDtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, 
int64_t n, int64_t k, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13768,11 +20500,31 @@ cublasStatus_t cublasDtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13789,11 +20541,31 @@ cublasStatus_t cublasCtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13810,11 +20582,31 @@ cublasStatus_t cublasCtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13831,11 +20623,31 @@ cublasStatus_t cublasZtbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13852,11 +20664,29 @@ cublasStatus_t cublasZtbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* AP, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13871,11 +20701,27 @@ cublasStatus_t cublasStpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* AP, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13890,11 +20736,27 @@ cublasStatus_t cublasStpmv_v2_64(cublasHandle_t handle, 
cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* AP, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13909,11 +20771,27 @@ cublasStatus_t cublasDtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* AP, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13928,11 +20806,27 @@ cublasStatus_t cublasDtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpmv_v2(cublasHandle_t handle, cublasFillMode_t 
uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* AP, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13947,11 +20841,27 @@ cublasStatus_t cublasCtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* AP, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13966,11 +20876,27 @@ cublasStatus_t cublasCtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpmv_v2) < 0 || 
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -13985,11 +20911,27 @@ cublasStatus_t cublasZtpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* AP, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14004,11 +20946,28 @@ cublasStatus_t cublasZtpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14024,11 +20983,29 @@ cublasStatus_t cublasStrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14044,11 +21021,29 @@ cublasStatus_t cublasStrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); 
return return_value; } cublasStatus_t cublasDtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14064,11 +21059,29 @@ cublasStatus_t cublasDtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14084,11 +21097,29 @@ cublasStatus_t cublasDtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14104,11 +21135,29 @@ cublasStatus_t cublasCtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14124,11 +21173,29 @@ cublasStatus_t cublasCtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsv_v2) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -14144,11 +21211,29 @@ cublasStatus_t cublasZtrsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14164,11 +21249,28 @@ cublasStatus_t cublasZtrsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const float* AP, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14183,11 +21285,27 @@ cublasStatus_t cublasStpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const float* AP, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14202,11 +21320,27 @@ cublasStatus_t cublasStpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpsv_v2(cublasHandle_t handle, cublasFillMode_t 
uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double* AP, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14221,11 +21355,27 @@ cublasStatus_t cublasDtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const double* AP, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14240,11 +21390,27 @@ cublasStatus_t cublasDtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuComplex* AP, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpsv_v2) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -14259,11 +21425,27 @@ cublasStatus_t cublasCtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuComplex* AP, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14278,11 +21460,27 @@ cublasStatus_t cublasCtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14297,11 +21495,27 @@ cublasStatus_t cublasZtpsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, const cuDoubleComplex* AP, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14316,11 +21530,29 @@ cublasStatus_t cublasZtpsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const float* A, int lda, float* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14337,11 +21569,31 @@ cublasStatus_t cublasStbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const float* A, int64_t lda, float* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14358,11 +21610,31 @@ cublasStatus_t cublasStbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const double* A, int lda, double* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14379,11 +21651,31 @@ cublasStatus_t cublasDtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const double* A, int64_t lda, double* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14400,11 +21692,31 @@ cublasStatus_t cublasDtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14421,11 +21733,31 @@ cublasStatus_t cublasCtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuComplex* A, int64_t lda, cuComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14442,11 +21774,31 @@ cublasStatus_t cublasCtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbsv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14463,11 +21815,31 @@ cublasStatus_t cublasZtbsv_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t n, int64_t k, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* x, int64_t incx) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); 
cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtbsv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14484,11 +21856,32 @@ cublasStatus_t cublasZtbsv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, x, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasSsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14506,11 +21899,33 @@ cublasStatus_t cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, 
cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14528,11 +21943,33 @@ cublasStatus_t cublasSsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14550,11 +21987,33 @@ cublasStatus_t cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14572,11 +22031,33 @@ cublasStatus_t cublasDsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14594,11 +22075,33 @@ cublasStatus_t cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14616,11 +22119,33 @@ cublasStatus_t cublasCsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14638,11 +22163,33 @@ cublasStatus_t cublasZsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14660,11 +22207,33 @@ cublasStatus_t cublasZsymv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14682,11 +22251,33 @@ cublasStatus_t cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14704,11 +22295,33 @@ cublasStatus_t cublasChemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14726,11 +22339,33 @@ cublasStatus_t cublasZhemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const 
cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14748,11 +22383,34 @@ cublasStatus_t cublasZhemv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const float* 
alpha, const float* A, int lda, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14771,11 +22429,35 @@ cublasStatus_t cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14794,11 +22476,35 @@ cublasStatus_t cublasSsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const double* alpha, const double* A, int lda, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14817,11 +22523,35 @@ cublasStatus_t cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14840,11 +22570,35 @@ cublasStatus_t cublasDsbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14863,11 +22617,35 @@ cublasStatus_t cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14886,11 +22664,35 
@@ cublasStatus_t cublasChbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhbmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14909,11 +22711,35 @@ cublasStatus_t cublasZhbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhbmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14932,11 +22758,33 @@ cublasStatus_t cublasZhbmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* AP, const float* x, int incx, const float* beta, float* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14953,11 +22801,31 @@ cublasStatus_t cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* AP, const float* x, int64_t incx, const float* beta, float* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14974,11 +22842,31 @@ cublasStatus_t cublasSspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* AP, const double* x, int incx, const double* beta, double* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -14995,11 +22883,31 @@ cublasStatus_t cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* AP, const double* x, int64_t incx, const double* beta, double* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15016,11 +22924,31 @@ cublasStatus_t cublasDspmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* AP, const cuComplex* x, int incx, const cuComplex* beta, cuComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; 
if (rpc_start_request(0, RPC_cublasChpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15037,11 +22965,31 @@ cublasStatus_t cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* AP, const cuComplex* x, int64_t incx, const cuComplex* beta, cuComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpmv_v2_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -15058,11 +23006,31 @@ cublasStatus_t cublasChpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* AP, const cuDoubleComplex* x, int incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpmv_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15079,11 +23047,31 @@ 
cublasStatus_t cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* AP, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpmv_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15100,11 +23088,31 @@ cublasStatus_t 
cublasZhpmv_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSger_v2(cublasHandle_t handle, int m, int n, const float* alpha, const float* x, int incx, const float* y, int incy, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSger_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15121,11 +23129,31 @@ cublasStatus_t cublasSger_v2(cublasHandle_t handle, int m, int n, const float* a rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) 
return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSger_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15142,11 +23170,31 @@ cublasStatus_t cublasSger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, con rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDger_v2(cublasHandle_t handle, int m, int n, const double* alpha, const double* x, int incx, const double* y, int incy, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDger_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15163,11 +23211,31 @@ cublasStatus_t cublasDger_v2(cublasHandle_t handle, int m, int n, const double* rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const double* alpha, const double* x, int64_t incx, const double* y, int64_t incy, double* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDger_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15184,11 +23252,31 @@ cublasStatus_t cublasDger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, con rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeru_v2(cublasHandle_t handle, int m, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeru_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15205,11 +23293,31 @@ cublasStatus_t cublasCgeru_v2(cublasHandle_t handle, int m, int n, const cuCompl rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeru_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15226,11 +23334,31 @@ cublasStatus_t cublasCgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgerc_v2(cublasHandle_t handle, int m, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgerc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15247,11 +23375,31 @@ cublasStatus_t cublasCgerc_v2(cublasHandle_t handle, int m, int n, const cuCompl rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return 
return_value; } cublasStatus_t cublasCgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgerc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15268,11 +23416,31 @@ cublasStatus_t cublasCgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeru_v2(cublasHandle_t handle, int m, int n, const cuDoubleComplex* 
alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeru_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15289,11 +23457,31 @@ cublasStatus_t cublasZgeru_v2(cublasHandle_t handle, int m, int n, const cuDoubl rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, 
cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeru_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15310,11 +23498,31 @@ cublasStatus_t cublasZgeru_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgerc_v2(cublasHandle_t handle, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgerc_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15331,11 +23539,31 @@ cublasStatus_t cublasZgerc_v2(cublasHandle_t handle, int m, int n, const cuDoubl rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgerc_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15352,11 +23580,29 @@ cublasStatus_t cublasZgerc_v2_64(cublasHandle_t handle, int64_t m, int64_t n, co rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15371,11 +23617,27 @@ cublasStatus_t cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, float* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr_v2_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -15390,11 +23652,27 @@ cublasStatus_t cublasSsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15409,11 +23687,27 @@ cublasStatus_t cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, double* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15428,11 +23722,27 @@ cublasStatus_t cublasDsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } 
cublasStatus_t cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15447,11 +23757,27 @@ cublasStatus_t cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15466,11 +23792,27 @@ cublasStatus_t cublasCsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr_v2) < 0 || 
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15485,11 +23827,27 @@ cublasStatus_t cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15504,11 +23862,27 @@ cublasStatus_t cublasZsyr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const cuComplex* x, int incx, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15523,11 +23897,27 @@ cublasStatus_t cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const cuComplex* x, int64_t incx, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15542,11 +23932,27 @@ cublasStatus_t cublasCher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15561,11 +23967,27 @@ cublasStatus_t cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15580,11 +24002,26 @@ cublasStatus_t cublasZher_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15598,11 +24035,25 @@ cublasStatus_t cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15616,11 +24067,25 @@ cublasStatus_t cublasSspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, 
const double* x, int incx, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15634,11 +24099,25 @@ cublasStatus_t cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if 
(rpc_start_request(0, RPC_cublasDspr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15652,11 +24131,25 @@ cublasStatus_t cublasDspr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const cuComplex* x, int incx, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15670,11 +24163,25 @@ cublasStatus_t cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const cuComplex* x, int64_t incx, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15688,11 +24195,25 @@ cublasStatus_t cublasChpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15706,11 +24227,25 @@ cublasStatus_t cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr_v2_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -15724,11 +24259,28 @@ cublasStatus_t cublasZhpr_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, in rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, const float* y, int incy, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15745,11 +24297,31 @@ cublasStatus_t cublasSsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15766,11 +24338,31 @@ cublasStatus_t cublasSsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, const double* y, int incy, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15787,11 +24379,31 @@ cublasStatus_t cublasDsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); 
+ maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, const double* y, int64_t incy, double* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15808,11 +24420,31 @@ cublasStatus_t cublasDsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15829,11 +24461,31 @@ cublasStatus_t cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15850,11 +24502,31 @@ cublasStatus_t cublasCsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15871,11 +24543,31 @@ cublasStatus_t cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15892,11 +24584,31 @@ cublasStatus_t cublasZsyr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15913,11 +24625,31 @@ cublasStatus_t cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2_v2_64(cublasHandle_t handle, 
cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* A, int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15934,11 +24666,31 @@ cublasStatus_t cublasCher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int 
incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15955,11 +24707,31 @@ cublasStatus_t cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* A, 
int64_t lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15976,11 +24748,30 @@ cublasStatus_t cublasZher2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, const float* y, int incy, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -15996,11 +24787,29 @@ cublasStatus_t cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSspr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16016,11 +24825,29 @@ cublasStatus_t cublasSspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, const double* y, int incy, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; 
if (rpc_start_request(0, RPC_cublasDspr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16036,11 +24863,29 @@ cublasStatus_t cublasDspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const double* alpha, const double* x, int64_t incx, const double* y, int64_t incy, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDspr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16056,11 +24901,29 @@ cublasStatus_t cublasDspr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(double)) < 0 
|| rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16076,11 +24939,29 @@ cublasStatus_t cublasChpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuComplex* alpha, const cuComplex* x, int64_t incx, const cuComplex* y, int64_t incy, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChpr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16096,11 +24977,29 @@ cublasStatus_t cublasChpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr2_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16116,11 +25015,29 @@ cublasStatus_t cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); 
return return_value; } cublasStatus_t cublasZhpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* x, int64_t incx, const cuDoubleComplex* y, int64_t incy, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhpr2_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16136,11 +25053,39 @@ cublasStatus_t cublasZhpr2_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, i rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* const Aarray[], int lda, const float* const 
xarray[], int incx, const float* beta, float* const yarray[], int incy, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemvBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -16159,11 +25104,49 @@ cublasStatus_t cublasSgemvBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __nv_bfloat16* const Aarray[], int lda, const __nv_bfloat16* const xarray[], int incx, const float* beta, __nv_bfloat16* const yarray[], int incy, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyHostToDevice); + for (int i = 0; i < 
static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSTgemvBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -16182,11 +25165,46 @@ cublasStatus_t cublasTSTgemvBatched(cublasHandle_t handle, cublasOperation_t tra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)xarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)xarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)yarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)yarray[i], 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* A, int lda, long long int strideA, const float* x, int incx, long long int stridex, const float* beta, float* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16209,11 +25227,43 @@ cublasStatus_t cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16236,11 +25286,43 @@ cublasStatus_t cublasSgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double* alpha, const double* A, int lda, long long int strideA, 
const double* x, int incx, long long int stridex, const double* beta, double* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16263,11 +25345,43 @@ cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* x, int64_t incx, long long int stridex, const double* beta, double* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16290,11 +25404,43 @@ cublasStatus_t cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* x, int incx, long long int stridex, const cuComplex* beta, cuComplex* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16317,11 +25463,43 @@ cublasStatus_t cublasCgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* x, int64_t incx, long long int stridex, const cuComplex* beta, cuComplex* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16344,11 +25522,43 @@ 
cublasStatus_t cublasCgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* x, int incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16371,11 +25581,43 @@ cublasStatus_t cublasZgemvStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* x, int64_t incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16398,11 +25640,43 @@ cublasStatus_t cublasZgemvStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, y, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSHgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __half* A, int lda, long long int strideA, const __half* x, int incx, long long int stridex, const float* beta, __half* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSHgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16425,11 +25699,43 @@ cublasStatus_t cublasHSHgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } 
cublasStatus_t cublasHSHgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __half* A, int64_t lda, long long int strideA, const __half* x, int64_t incx, long long int stridex, const float* beta, __half* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSHgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16452,11 +25758,43 @@ cublasStatus_t cublasHSHgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __half* A, int lda, long long int strideA, const __half* x, int incx, long long int stridex, const float* beta, float* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSSgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16479,11 +25817,43 @@ cublasStatus_t cublasHSSgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHSSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __half* A, int64_t lda, long long int strideA, const __half* x, int64_t incx, long long int stridex, const float* beta, 
float* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHSSgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16506,11 +25876,43 @@ cublasStatus_t cublasHSSgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSTgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __nv_bfloat16* A, int lda, long long int strideA, const __nv_bfloat16* x, int incx, long long int stridex, const float* beta, __nv_bfloat16* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); 
cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSTgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16533,11 +25935,43 @@ cublasStatus_t cublasTSTgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(__nv_bfloat16)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSTgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* A, int64_t lda, long long int strideA, const __nv_bfloat16* x, int64_t incx, long long int stridex, const float* beta, __nv_bfloat16* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSTgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16560,11 +25994,43 @@ cublasStatus_t cublasTSTgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(__nv_bfloat16)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const __nv_bfloat16* A, int lda, long long int strideA, const __nv_bfloat16* x, int incx, long long int stridex, const float* beta, float* y, int incy, long long int stridey, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSSgemvStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16587,11 +26053,43 @@ cublasStatus_t 
cublasTSSgemvStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasTSSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* A, int64_t lda, long long int strideA, const __nv_bfloat16* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasTSSgemvStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16614,11 +26112,41 @@ cublasStatus_t cublasTSSgemvStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, y, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&stridex, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incy, cudaMemcpyDeviceToHost); 
+ maybe_copy_unified_arg(0, (void*)&stridey, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); /* pre-call sync pushes host data to the device, matching every other argument here; was DeviceToHost */ + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); /* pre-call sync pushes host data to the device, matching every other argument here; was DeviceToHost */ + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16640,11 +26168,39 @@ cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16665,11 +26221,39 @@ cublasStatus_t 
cublasSgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16690,11 +26274,39 @@ cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16715,11 +26327,39 @@ cublasStatus_t cublasDgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16740,11 +26380,39 @@ cublasStatus_t cublasCgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16765,11 +26433,39 @@ cublasStatus_t cublasCgemm_v2_64(cublasHandle_t handle, 
cublasOperation_t transa rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3m(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3m) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16790,11 +26486,39 @@ cublasStatus_t cublasCgemm3m(cublasHandle_t handle, cublasOperation_t transa, cu rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3m_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16815,11 +26539,39 @@ cublasStatus_t cublasCgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm_v2(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16840,11 +26592,39 @@ cublasStatus_t cublasZgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16865,11 +26645,39 @@ 
cublasStatus_t cublasZgemm_v2_64(cublasHandle_t handle, cublasOperation_t transa rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm3m) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16890,11 +26698,39 @@ cublasStatus_t cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, cu rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemm3m_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16915,11 +26751,39 @@ cublasStatus_t cublasZgemm3m_64(cublasHandle_t handle, cublasOperation_t transa, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, const __half* B, int ldb, const __half* beta, __half* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16940,11 +26804,39 @@ cublasStatus_t cublasHgemm(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemm_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* A, int64_t lda, const __half* B, int64_t ldb, const __half* beta, __half* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemm_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -16965,11 +26857,36 @@ cublasStatus_t cublasHgemm_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -16987,11 +26904,33 @@ cublasStatus_t cublasSsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17009,11 +26948,33 @@ cublasStatus_t cublasSsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const double* A, int lda, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17031,11 +26992,33 @@ cublasStatus_t cublasDsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17053,11 +27036,33 @@ cublasStatus_t cublasDsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17075,11 +27080,33 @@ cublasStatus_t cublasCsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17097,11 +27124,33 @@ cublasStatus_t cublasCsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17119,11 +27168,33 @@ cublasStatus_t cublasZsyrk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17141,11 +27212,33 @@ cublasStatus_t cublasZsyrk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const cuComplex* A, int lda, const float* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17163,11 +27256,33 @@ cublasStatus_t cublasCherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const cuComplex* A, int64_t lda, const float* beta, cuComplex* C, int64_t ldc) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17185,11 +27300,33 @@ cublasStatus_t cublasCherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const cuDoubleComplex* A, int 
lda, const double* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZherk_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17207,11 +27344,33 @@ cublasStatus_t cublasZherk_v2(cublasHandle_t handle, cublasFillMode_t uplo, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, 
int64_t n, int64_t k, const double* alpha, const cuDoubleComplex* A, int64_t lda, const double* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZherk_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17229,11 +27388,35 @@ cublasStatus_t cublasZherk_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasSsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17253,11 +27436,37 @@ cublasStatus_t cublasSsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17277,11 +27486,37 @@ cublasStatus_t cublasSsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17301,11 +27536,37 @@ cublasStatus_t cublasDsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub 
rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17325,11 +27586,37 @@ cublasStatus_t cublasDsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17349,11 +27636,37 @@ cublasStatus_t cublasCsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17373,11 +27686,37 @@ cublasStatus_t cublasCsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17397,11 +27736,37 @@ cublasStatus_t cublasZsyr2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyr2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17421,11 +27786,37 @@ cublasStatus_t cublasZsyr2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const float* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, 
RPC_cublasCher2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17445,11 +27836,37 @@ cublasStatus_t cublasCher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const float* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCher2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17469,11 +27886,37 @@ cublasStatus_t cublasCher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const double* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2k_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17493,11 +27936,37 @@ cublasStatus_t cublasZher2k_v2(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZher2k_v2_64(cublasHandle_t handle, 
cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const double* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZher2k_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17517,11 +27986,37 @@ cublasStatus_t cublasZher2k_v2_64(cublasHandle_t handle, cublasFillMode_t uplo, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17541,11 +28036,37 @@ cublasStatus_t cublasSsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17565,11 +28086,37 @@ cublasStatus_t cublasSsyrkx_64(cublasHandle_t handle, cublasFillMode_t 
uplo, cub rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17589,11 +28136,37 @@ cublasStatus_t cublasDsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17613,11 +28186,37 @@ cublasStatus_t cublasDsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17637,11 +28236,37 @@ cublasStatus_t cublasCsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } 
cublasStatus_t cublasCsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsyrkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17661,11 +28286,37 @@ cublasStatus_t cublasCsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17685,11 +28336,37 @@ cublasStatus_t cublasZsyrkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsyrkx_64) < 0 || rpc_write(0, &handle, 
sizeof(cublasHandle_t)) < 0 || @@ -17709,11 +28386,37 @@ cublasStatus_t cublasZsyrkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const float* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17733,11 +28436,37 @@ cublasStatus_t cublasCherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const float* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCherkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17757,11 +28486,37 @@ cublasStatus_t cublasCherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, 
const cuDoubleComplex* B, int ldb, const double* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZherkx) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17781,11 +28536,37 @@ cublasStatus_t cublasZherkx(cublasHandle_t handle, cublasFillMode_t uplo, cublas rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const double* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZherkx_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17805,11 +28586,37 @@ cublasStatus_t cublasZherkx_64(cublasHandle_t handle, cublasFillMode_t uplo, cub rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17829,11 +28636,37 @@ cublasStatus_t cublasSsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, const float* beta, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; 
if (rpc_start_request(0, RPC_cublasSsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17853,11 +28686,37 @@ cublasStatus_t cublasSsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17877,11 +28736,37 @@ cublasStatus_t cublasDsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, const double* beta, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17901,11 +28786,37 @@ cublasStatus_t cublasDsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuComplex* alpha, const 
cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17925,11 +28836,37 @@ cublasStatus_t cublasCsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17949,11 +28886,37 @@ cublasStatus_t cublasCsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17973,11 +28936,37 @@ cublasStatus_t cublasZsymm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, 
&return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZsymm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -17997,11 +28986,37 @@ cublasStatus_t cublasZsymm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, const cuComplex* beta, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18021,11 +29036,37 @@ cublasStatus_t cublasChemm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasChemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, const cuComplex* beta, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasChemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18045,11 +29086,37 @@ cublasStatus_t cublasChemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, 
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18069,11 +29136,37 @@ cublasStatus_t cublasZhemm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); 
+ maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZhemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZhemm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18093,11 +29186,36 @@ cublasStatus_t cublasZhemm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* A, int lda, float* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18116,11 +29234,35 @@ cublasStatus_t 
cublasStrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, float* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18139,11 +29281,35 @@ cublasStatus_t cublasStrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* A, int lda, double* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18162,11 +29328,35 @@ cublasStatus_t cublasDtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, double* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18185,11 +29375,35 @@ cublasStatus_t cublasDtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, cuComplex* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18208,11 +29422,35 @@ cublasStatus_t cublasCtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, 
cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, cuComplex* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18231,11 +29469,35 @@ cublasStatus_t cublasCtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, cuDoubleComplex* B, int ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18254,11 +29516,35 @@ cublasStatus_t cublasZtrsm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, B, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, cuDoubleComplex* B, int64_t ldb) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18277,11 +29563,37 @@ cublasStatus_t cublasZtrsm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, B, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* A, int lda, const float* B, int ldb, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18302,11 +29614,39 @@ cublasStatus_t cublasStrmm_v2(cublasHandle_t handle, 
cublasSideMode_t side, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* B, int64_t ldb, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18327,11 +29667,39 @@ cublasStatus_t cublasStrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* A, int lda, const double* B, int ldb, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18352,11 +29720,39 @@ cublasStatus_t cublasDtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* B, int64_t ldb, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18377,11 +29773,39 @@ cublasStatus_t cublasDtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* B, int ldb, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18402,11 +29826,39 @@ cublasStatus_t cublasCtrmm_v2(cublasHandle_t 
handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* B, int64_t ldb, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18427,11 +29879,39 @@ cublasStatus_t cublasCtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* B, int ldb, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmm_v2) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18452,11 +29932,39 @@ cublasStatus_t cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrmm_v2_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18477,11 +29985,46 @@ cublasStatus_t cublasZtrmm_v2_64(cublasHandle_t handle, cublasSideMode_t side, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* const Aarray[], int lda, const __half* const Barray[], int ldb, const __half* beta, __half* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18502,11 +30045,53 @@ cublasStatus_t cublasHgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); 
i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* const Aarray[], int64_t lda, const __half* const Barray[], int64_t ldb, const __half* beta, __half* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmBatched_64) < 0 || rpc_write(0, &batchCount, 
sizeof(int64_t)) < 0 || @@ -18527,11 +30112,53 @@ cublasStatus_t cublasHgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* const Aarray[], int lda, const float* const Barray[], int ldb, const float* beta, float* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18552,11 +30179,53 @@ cublasStatus_t cublasSgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* const Aarray[], int64_t lda, const float* const Barray[], int64_t ldb, const float* beta, float* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, 
cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18577,12 +30246,53 @@ cublasStatus_t cublasSgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, 
cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* const Aarray[], int lda, const double* const Barray[], int ldb, const double* beta, double* const Carray[], int ldc, int batchCount) { - cuda_memcpy_unified_ptrs(0, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18605,12 +30315,53 @@ cublasStatus_t cublasDgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; - cuda_memcpy_unified_ptrs(0, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* const Aarray[], int64_t lda, const double* const Barray[], int64_t ldb, const double* beta, double* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18633,11 +30384,53 @@ cublasStatus_t 
cublasDgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18660,11 +30453,53 @@ cublasStatus_t cublasCgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i 
< static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18687,11 +30522,53 @@ cublasStatus_t cublasCgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < 
static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const Barray[], int ldb, const cuComplex* beta, cuComplex* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, 
cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18714,11 +30591,53 @@ cublasStatus_t cublasCgemm3mBatched(cublasHandle_t handle, cublasOperation_t tra rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* const Aarray[], int64_t lda, const cuComplex* const Barray[], int64_t ldb, const cuComplex* beta, cuComplex* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18741,11 +30660,53 @@ cublasStatus_t cublasCgemm3mBatched_64(cublasHandle_t handle, cublasOperation_t rpc_wait_for_response(0) < 0 || 
rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const Barray[], int ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int ldc, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -18768,11 +30729,53 @@ cublasStatus_t cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t trans rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int64_t lda, const cuDoubleComplex* const Barray[], int64_t ldb, const cuDoubleComplex* beta, cuDoubleComplex* const Carray[], int64_t ldc, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + 
maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -18795,11 +30798,50 @@ cublasStatus_t cublasZgemmBatched_64(cublasHandle_t handle, cublasOperation_t tr rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + 
maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, long long int strideA, const __half* B, int ldb, long long int strideB, const __half* beta, __half* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, 
cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18826,11 +30868,47 @@ cublasStatus_t cublasHgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const __half* alpha, const __half* A, int64_t lda, long long int strideA, const __half* B, int64_t ldb, long long int strideB, const __half* beta, __half* C, int64_t ldc, long long int strideC, int64_t batchCount) { + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasHgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18857,11 +30935,47 @@ cublasStatus_t cublasHgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(__half)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, long long int strideA, const float* B, int ldb, long long int strideB, const float* beta, float* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18888,11 +31002,47 @@ cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, 
cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* B, int64_t ldb, long long int strideB, const float* beta, float* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18919,11 +31069,47 @@ cublasStatus_t cublasSgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, long long int strideA, const double* B, int ldb, long long int strideB, const double* beta, double* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18950,11 +31136,47 @@ cublasStatus_t cublasDgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* B, int64_t ldb, long long int strideB, const double* beta, double* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -18981,11 
+31203,47 @@ cublasStatus_t cublasDgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19012,11 +31270,47 @@ cublasStatus_t cublasCgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19043,11 +31337,47 @@ cublasStatus_t cublasCgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuComplex* alpha, const cuComplex* A, int lda, 
long long int strideA, const cuComplex* B, int ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19074,11 +31404,47 @@ cublasStatus_t cublasCgemm3mStridedBatched(cublasHandle_t handle, cublasOperatio rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* B, int64_t ldb, long long int strideB, const cuComplex* beta, cuComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgemm3mStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19105,11 +31471,47 @@ cublasStatus_t cublasCgemm3mStridedBatched_64(cublasHandle_t handle, cublasOpera rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* B, int ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int ldc, long long int strideC, int batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmStridedBatched) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19136,11 +31538,47 @@ cublasStatus_t 
cublasZgemmStridedBatched(cublasHandle_t handle, cublasOperation_ rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* B, int64_t ldb, long long int strideB, const cuDoubleComplex* beta, cuDoubleComplex* C, int64_t ldc, long long int strideC, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgemmStridedBatched_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19167,11 +31605,55 @@ cublasStatus_t cublasZgemmStridedBatched_64(cublasHandle_t handle, cublasOperati rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideA, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideB, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&strideC, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, const void* alpha, const void* const Aarray[], cudaDataType Atype, int64_t lda, const void* const Barray[], cudaDataType Btype, int64_t ldb, const void* beta, void* const Carray[], cudaDataType Ctype, int64_t ldc, int64_t batchCount, cublasComputeType_t computeType, cublasGemmAlgo_t algo) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Atype, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, 
cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Btype, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&Ctype, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&computeType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasGemmBatchedEx_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19199,11 +31681,50 @@ cublasStatus_t cublasGemmBatchedEx_64(cublasHandle_t handle, cublasOperation_t t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Atype, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Btype, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&Ctype, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&computeType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); 
cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19225,11 +31746,37 @@ cublasStatus_t cublasSgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, const float* beta, const float* B, int64_t ldb, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19251,11 +31798,37 @@ cublasStatus_t cublasSgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19277,11 +31850,37 @@ cublasStatus_t cublasDgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, 
cublasOperation_t transb, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, const double* beta, const double* B, int64_t ldb, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19303,11 +31902,37 @@ cublasStatus_t cublasDgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, const cuComplex* beta, const cuComplex* B, int ldb, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19329,11 +31954,37 @@ cublasStatus_t cublasCgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, const cuComplex* beta, const cuComplex* B, int64_t ldb, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19355,11 +32006,37 @@ cublasStatus_t cublasCgeam_64(cublasHandle_t handle, cublasOperation_t transa, c 
rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int ldb, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeam) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19381,11 +32058,37 @@ cublasStatus_t cublasZgeam(cublasHandle_t handle, cublasOperation_t transa, cubl rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* beta, const cuDoubleComplex* B, int64_t ldb, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeam_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19407,11 +32110,41 @@ cublasStatus_t cublasZgeam_64(cublasHandle_t handle, cublasOperation_t transa, c rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, const float* const A[], 
int lda, float* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19431,11 +32164,45 @@ cublasStatus_t cublasStrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const float* alpha, const float* const A[], int64_t lda, float* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19455,11 +32222,45 @@ cublasStatus_t cublasStrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, const double* const A[], int lda, double* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19479,11 +32280,45 @@ cublasStatus_t cublasDtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const double* alpha, const double* const A[], int64_t lda, double* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19503,11 +32338,45 @@ cublasStatus_t cublasDtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid 
rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuComplex* alpha, const cuComplex* const A[], int lda, cuComplex* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19527,11 +32396,45 @@ cublasStatus_t cublasCtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* const A[], int64_t lda, cuComplex* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19551,11 +32454,45 @@ cublasStatus_t cublasCtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const B[], int ldb, int batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < 
static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsmBatched) < 0 || rpc_write(0, &batchCount, sizeof(int)) < 0 || @@ -19575,11 +32512,45 @@ cublasStatus_t cublasZtrsmBatched(cublasHandle_t handle, cublasSideMode_t side, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, 
cublasOperation_t trans, cublasDiagType_t diag, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* const A[], int64_t lda, cuDoubleComplex* const B[], int64_t ldb, int64_t batchCount) { + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrsmBatched_64) < 0 || rpc_write(0, &batchCount, sizeof(int64_t)) < 0 || @@ -19599,11 +32570,38 @@ cublasStatus_t cublasZtrsmBatched_64(cublasHandle_t handle, cublasSideMode_t sid rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&side, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&diag, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchCount); i++) + maybe_copy_unified_arg(0, (void*)B[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const float* A, int lda, const float* x, int incx, float* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19621,11 +32619,31 @@ cublasStatus_t cublasSdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const float* A, int64_t lda, const float* x, int64_t incx, float* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19643,11 +32661,31 @@ cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const double* A, int lda, const double* x, int incx, double* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19665,11 +32703,31 @@ cublasStatus_t cublasDdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const double* A, int64_t lda, const double* x, int64_t incx, double* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19687,11 +32745,31 @@ cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuComplex* A, int lda, const cuComplex* x, int incx, cuComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19709,11 +32787,31 @@ cublasStatus_t cublasCdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, cuComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19731,11 +32829,31 @@ cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } 
cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const cuDoubleComplex* A, int lda, const cuDoubleComplex* x, int incx, cuDoubleComplex* C, int ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdgmm) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19753,11 +32871,31 @@ cublasStatus_t cublasZdgmm(cublasHandle_t handle, cublasSideMode_t mode, int m, rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* 
A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZdgmm_64) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -19775,11 +32913,33 @@ cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int6 rpc_read(0, C, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&incx, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* const A[], int lda, float* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19794,11 +32954,35 @@ cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float* c rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* const A[], int lda, double* const Ainv[], 
int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19813,11 +32997,35 @@ cublasStatus_t cublasDmatinvBatched(cublasHandle_t handle, int n, const double* rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t 
cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, cuComplex* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19832,11 +33040,35 @@ cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n, const cuComple rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, cuDoubleComplex* const Ainv[], int lda_inv, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Ainv[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZmatinvBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19851,11 +33083,35 @@ cublasStatus_t cublasZmatinvBatched(cublasHandle_t handle, int n, const cuDouble rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Ainv, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, 
(void*)Ainv[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda_inv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* const Aarray[], int lda, float* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19870,11 +33126,35 @@ cublasStatus_t cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, float* c rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* const Aarray[], int lda, double* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19889,11 +33169,35 @@ cublasStatus_t cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, double* rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + 
maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComplex* const Aarray[], int lda, cuComplex* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19908,11 +33212,35 @@ cublasStatus_t cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, cuComple rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const TauArray[], int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgeqrfBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19927,11 +33255,39 @@ cublasStatus_t cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, cuDouble rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)TauArray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)TauArray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, float* const Aarray[], int lda, float* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, 
RPC_cublasSgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19951,11 +33307,43 @@ cublasStatus_t cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, double* const Aarray[], int lda, double* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19975,11 +33363,43 @@ cublasStatus_t cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuComplex* const Aarray[], int lda, cuComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -19999,11 +33419,43 @@ cublasStatus_t cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, cuDoubleComplex* const Aarray[], int lda, cuDoubleComplex* const Carray[], int ldc, int* info, int* devInfoArray, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgelsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20023,11 +33475,33 @@ cublasStatus_t cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans rpc_read(0, devInfoArray, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Carray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Carray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devInfoArray, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20041,11 +33515,23 @@ cublasStatus_t cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20059,11 +33545,23 @@ cublasStatus_t cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20077,11 +33575,23 @@ cublasStatus_t cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtpttr) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20095,11 +33605,23 @@ cublasStatus_t cublasZtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, A, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasStrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20113,11 +33635,23 @@ cublasStatus_t cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(float)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDtrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20131,11 +33665,23 @@ cublasStatus_t cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCtrttp) < 0 || 
rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20149,11 +33695,23 @@ cublasStatus_t cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(cuComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZtrttp) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20167,11 +33725,30 @@ cublasStatus_t cublasZtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, rpc_read(0, AP, sizeof(cuDoubleComplex)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&uplo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)AP, cudaMemcpyDeviceToHost); return 
return_value; } cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* const A[], int lda, const int* P, float* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20188,11 +33765,37 @@ cublasStatus_t cublasSgetriBatched(cublasHandle_t handle, int n, const float* co rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + 
maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* const A[], int lda, const int* P, double* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20209,11 +33812,37 @@ cublasStatus_t cublasDgetriBatched(cublasHandle_t handle, int n, const double* c rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex* const A[], int lda, const int* P, cuComplex* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20230,11 +33859,37 @@ cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n, const cuComplex rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleComplex* const A[], int lda, const int* P, cuDoubleComplex* const C[], int ldc, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgetriBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20251,11 +33906,39 @@ cublasStatus_t cublasZgetriBatched(cublasHandle_t handle, int n, const cuDoubleC rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return
CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)A[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)P, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)C[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float* const Aarray[], int lda, const int* devIpiv, float* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb,
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasSgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20274,11 +33957,41 @@ cublasStatus_t cublasSgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const double* const Aarray[], int lda, const int* devIpiv, double* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs,
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasDgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20297,11 +34010,41 @@ cublasStatus_t cublasDgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info,
cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuComplex* const Aarray[], int lda, const int* devIpiv, cuComplex* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasCgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20319,11 +34062,41 @@ cublasStatus_t cublasCgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray,
cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, const cuDoubleComplex* const Aarray[], int lda, const int* devIpiv, cuDoubleComplex* const Barray[], int ldb, int* info, int batchSize) { + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyHostToDevice); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasZgetrsBatched) < 0 || rpc_write(0, &batchSize, sizeof(int)) < 0 || @@ -20342,11
+34115,44 @@ cublasStatus_t cublasZgetrsBatched(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, info, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&batchSize, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&trans, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nrhs, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Aarray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Aarray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)devIpiv, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)Barray, cudaMemcpyDeviceToHost); + for (int i = 0; i < static_cast<int>(batchSize); i++) + maybe_copy_unified_arg(0, (void*)Barray[i], cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)info, cudaMemcpyDeviceToHost); return return_value; } cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, int m, int n, int k, const unsigned char* A, int A_bias, int lda, const unsigned char* B, int B_bias, int ldb, unsigned char* C, int C_bias, int ldc, int C_mult, int C_shift) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&transc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k,
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&A_bias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&B_bias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)C, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&C_bias, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&C_mult, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&C_shift, cudaMemcpyHostToDevice); cublasStatus_t return_value; if (rpc_start_request(0, RPC_cublasUint8gemmBias) < 0 || rpc_write(0, &handle, sizeof(cublasHandle_t)) < 0 || @@ -20371,11 +34177,31 @@ cublasStatus_t cublasUint8gemmBias(cublasHandle_t handle, cublasOperation_t tran rpc_read(0, C, sizeof(unsigned char)) < 0 || rpc_end_response(0, &return_value) < 0) return CUBLAS_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transa, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transb, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&transc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&m, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)A, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&A_bias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lda, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)B, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&B_bias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldb, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)C, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&C_bias, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&ldc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&C_mult, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&C_shift, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int* value) { + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetProperty) < 0 || rpc_write(0, &type, sizeof(libraryPropertyType)) < 0 || @@ -20384,33 +34210,41 @@ cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int* value) rpc_read(0, value, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&type, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)value, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreate(cudnnHandle_t* handle) { + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreate) < 0 || rpc_wait_for_response(0) < 0 || rpc_read(0, handle, sizeof(cudnnHandle_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)handle, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroy(cudnnHandle_t handle) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroy) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t 
cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetStream) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20418,11 +34252,15 @@ cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&streamId, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t* streamId) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetStream) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20431,11 +34269,16 @@ cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t* streamId) rpc_read(0, streamId, sizeof(cudaStream_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)streamId, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetCallback(unsigned* mask, void** udata, cudnnCallback_t* fptr) { + maybe_copy_unified_arg(0, (void*)mask, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)udata, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetCallback) < 0 || rpc_write(0, mask, sizeof(unsigned)) < 0 || @@ -20447,6 +34290,9 @@ cudnnStatus_t cudnnGetCallback(unsigned* mask, void** udata, cudnnCallback_t* fp rpc_read(0, fptr, 
sizeof(cudnnCallback_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)mask, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)udata, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)fptr, cudaMemcpyDeviceToHost); return return_value; } @@ -20462,6 +34308,8 @@ cudnnStatus_t cudnnGraphVersionCheck() cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t* descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptorType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendCreateDescriptor) < 0 || rpc_write(0, &descriptorType, sizeof(cudnnBackendDescriptorType_t)) < 0 || @@ -20470,44 +34318,57 @@ cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descript rpc_read(0, descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptorType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendDestroyDescriptor) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, 
RPC_cudnnBackendInitialize) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendFinalize) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, cudnnBackendAttributeType_t attributeType, int64_t elementCount, const void* arrayOfElements) { + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attributeName, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&attributeType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&elementCount, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)arrayOfElements, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendSetAttribute) < 0 || rpc_write(0, &descriptor, sizeof(cudnnBackendDescriptor_t)) < 0 || @@ -20518,11 +34379,19 @@ cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, cudn rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&descriptor, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attributeName, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&attributeType, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&elementCount, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)arrayOfElements, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendExecute) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20531,11 +34400,18 @@ cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t graph) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendPopulateCudaGraph) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20545,11 +34421,19 @@ cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDe rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t graph) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnBackendUpdateCudaGraph) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20559,11 +34443,16 @@ cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDesc rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&executionPlan, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&variantPack, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&graph, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t* tensorDesc) { + maybe_copy_unified_arg(0, (void*)tensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateTensorDescriptor) < 0 || rpc_write(0, tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -20571,11 +34460,19 @@ cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t* tensorDesc) rpc_read(0, tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, 
(void*)tensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetTensor4dDescriptor) < 0 || rpc_write(0, &tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -20588,11 +34485,28 @@ cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&c, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&nStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&hStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&wStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetTensor4dDescriptorEx) < 0 || rpc_write(0, &tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -20608,11 +34522,31 @@ cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, c rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&nStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&hStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&wStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t* dataType, int* n, int* c, int* h, int* w, int* nStride, int* cStride, int* hStride, int* wStride) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)nStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)cStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)hStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)wStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetTensor4dDescriptor) < 0 || rpc_write(0, &tensorDesc, sizeof(const cudnnTensorDescriptor_t)) < 0 || @@ -20637,11 +34571,23 @@ cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDes rpc_read(0, wStride, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)nStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)cStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)hStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)wStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t* size) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetTensorSizeInBytes) < 0 || rpc_write(0, &tensorDesc, sizeof(const cudnnTensorDescriptor_t)) < 0 || @@ -20650,22 +34596,30 @@ cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc 
rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyTensorDescriptor) < 0 || rpc_write(0, &tensorDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&tensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, const cudnnTensorDescriptor_t srcDesc, cudnnTensorDescriptor_t destDesc, size_t* destSizeInBytes) { + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&srcDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&destDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)destSizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnInitTransformDest) < 0 || rpc_write(0, &transformDesc, sizeof(const cudnnTensorTransformDescriptor_t)) < 0 || @@ -20676,11 +34630,16 @@ cudnnStatus_t cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t tran rpc_read(0, destSizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&srcDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&destDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)destSizeInBytes, cudaMemcpyDeviceToHost); return 
return_value; } cudnnStatus_t cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t* transformDesc) { + maybe_copy_unified_arg(0, (void*)transformDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateTensorTransformDescriptor) < 0 || rpc_write(0, transformDesc, sizeof(cudnnTensorTransformDescriptor_t)) < 0 || @@ -20688,22 +34647,26 @@ cudnnStatus_t cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescripto rpc_read(0, transformDesc, sizeof(cudnnTensorTransformDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)transformDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc) { + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyTensorTransformDescriptor) < 0 || rpc_write(0, &transformDesc, sizeof(cudnnTensorTransformDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&transformDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t* opTensorDesc) { + maybe_copy_unified_arg(0, (void*)opTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateOpTensorDescriptor) < 0 || rpc_write(0, opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || @@ -20711,11 +34674,16 @@ cudnnStatus_t cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t* opTensorD rpc_read(0, opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)opTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t 
cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&opTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&opTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&opTensorNanOpt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetOpTensorDescriptor) < 0 || rpc_write(0, &opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || @@ -20725,11 +34693,19 @@ cudnnStatus_t cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&opTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&opTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&opTensorNanOpt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t* opTensorOp, cudnnDataType_t* opTensorCompType, cudnnNanPropagation_t* opTensorNanOpt) { + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)opTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)opTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)opTensorNanOpt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetOpTensorDescriptor) < 0 || rpc_write(0, &opTensorDesc, sizeof(const cudnnOpTensorDescriptor_t)) < 0 || @@ -20742,22 +34718,29 @@ cudnnStatus_t cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTenso rpc_read(0, opTensorNanOpt, 
sizeof(cudnnNanPropagation_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)opTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)opTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)opTensorNanOpt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyOpTensorDescriptor) < 0 || rpc_write(0, &opTensorDesc, sizeof(cudnnOpTensorDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&opTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t* reduceTensorDesc) { + maybe_copy_unified_arg(0, (void*)reduceTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateReduceTensorDescriptor) < 0 || rpc_write(0, reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || @@ -20765,11 +34748,18 @@ cudnnStatus_t cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t* r rpc_read(0, reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)reduceTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, cudnnNanPropagation_t reduceTensorNanOpt, cudnnReduceTensorIndices_t reduceTensorIndices, cudnnIndicesType_t reduceTensorIndicesType) { + 
maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndicesType, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetReduceTensorDescriptor) < 0 || rpc_write(0, &reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || @@ -20781,11 +34771,23 @@ cudnnStatus_t cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduc rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorIndicesType, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, cudnnReduceTensorOp_t* reduceTensorOp, cudnnDataType_t* reduceTensorCompType, cudnnNanPropagation_t* reduceTensorNanOpt, cudnnReduceTensorIndices_t* reduceTensorIndices, cudnnIndicesType_t* reduceTensorIndicesType) { + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorOp, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorCompType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorNanOpt, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorIndices, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reduceTensorIndicesType, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetReduceTensorDescriptor) < 0 || rpc_write(0, &reduceTensorDesc, sizeof(const cudnnReduceTensorDescriptor_t)) < 0 || @@ -20802,22 +34804,35 @@ cudnnStatus_t cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t rpc_read(0, reduceTensorIndicesType, sizeof(cudnnIndicesType_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorOp, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorCompType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorIndices, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reduceTensorIndicesType, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) { + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyReduceTensorDescriptor) < 0 || rpc_write(0, &reduceTensorDesc, sizeof(cudnnReduceTensorDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetReductionIndicesSize(cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetReductionIndicesSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20829,11 +34844,21 @@ cudnnStatus_t cudnnGetReductionIndicesSize(cudnnHandle_t handle, const cudnnRedu rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetReductionWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -20845,11 +34870,17 @@ cudnnStatus_t cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, const cudnnRe rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; 
+ maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reduceTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&aDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&cDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t* filterDesc) { + maybe_copy_unified_arg(0, (void*)filterDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateFilterDescriptor) < 0 || rpc_write(0, filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || @@ -20857,11 +34888,19 @@ cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t* filterDesc) rpc_read(0, filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)filterDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetFilter4dDescriptor) < 0 || rpc_write(0, &filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || @@ -20874,11 +34913,25 @@ cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cud rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) 
return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, cudnnDataType_t* dataType, cudnnTensorFormat_t* format, int* k, int* c, int* h, int* w) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)format, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)k, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetFilter4dDescriptor) < 0 || rpc_write(0, &filterDesc, sizeof(const cudnnFilterDescriptor_t)) < 0 || @@ -20897,11 +34950,20 @@ cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDes rpc_read(0, w, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dataType, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)format, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)k, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)w, 
cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t* size) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetFilterSizeInBytes) < 0 || rpc_write(0, &filterDesc, sizeof(const cudnnFilterDescriptor_t)) < 0 || @@ -20910,22 +34972,27 @@ cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc rpc_read(0, size, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)size, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyFilterDescriptor) < 0 || rpc_write(0, &filterDesc, sizeof(cudnnFilterDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&filterDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t* poolingDesc) { + maybe_copy_unified_arg(0, (void*)poolingDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreatePoolingDescriptor) < 0 || rpc_write(0, poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || @@ -20933,11 +35000,21 @@ cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t* poolingDesc rpc_read(0, poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)poolingDesc, cudaMemcpyDeviceToHost); 
return return_value; } cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&maxpoolingNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&windowHeight, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&windowWidth, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&verticalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&horizontalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&verticalStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&horizontalStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetPooling2dDescriptor) < 0 || rpc_write(0, &poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || @@ -20952,11 +35029,29 @@ cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&maxpoolingNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&windowHeight, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&windowWidth, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&verticalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&horizontalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&verticalStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)&horizontalStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t* mode, cudnnNanPropagation_t* maxpoolingNanOpt, int* windowHeight, int* windowWidth, int* verticalPadding, int* horizontalPadding, int* verticalStride, int* horizontalStride) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)maxpoolingNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)windowHeight, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)windowWidth, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)verticalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)horizontalPadding, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)verticalStride, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)horizontalStride, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetPooling2dDescriptor) < 0 || rpc_write(0, &poolingDesc, sizeof(const cudnnPoolingDescriptor_t)) < 0 || @@ -20979,11 +35074,26 @@ cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t pooling rpc_read(0, horizontalStride, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)maxpoolingNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)windowHeight, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)windowWidth, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)verticalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)horizontalPadding, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, 
(void*)verticalStride, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)horizontalStride, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int* n, int* c, int* h, int* w) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&inputTensorDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetPooling2dForwardOutputDim) < 0 || rpc_write(0, &poolingDesc, sizeof(const cudnnPoolingDescriptor_t)) < 0 || @@ -20999,22 +35109,31 @@ cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t p rpc_read(0, w, sizeof(int)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&inputTensorDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)n, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)c, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)h, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)w, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { + maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyPoolingDescriptor) < 0 || rpc_write(0, &poolingDesc, sizeof(cudnnPoolingDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&poolingDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t* activationDesc) { + maybe_copy_unified_arg(0, (void*)activationDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateActivationDescriptor) < 0 || rpc_write(0, activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21022,11 +35141,16 @@ cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t* activ rpc_read(0, activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)activationDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&reluNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&coef, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetActivationDescriptor) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21036,11 +35160,19 @@ cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activatio rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&reluNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&coef, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetActivationDescriptor(const 
cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t* mode, cudnnNanPropagation_t* reluNanOpt, double* coef) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)reluNanOpt, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)coef, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetActivationDescriptor) < 0 || rpc_write(0, &activationDesc, sizeof(const cudnnActivationDescriptor_t)) < 0 || @@ -21053,11 +35185,17 @@ cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t act rpc_read(0, coef, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)reluNanOpt, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)coef, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&swish_beta, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetActivationDescriptorSwishBeta) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21065,11 +35203,15 @@ cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&swish_beta, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t 
cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double* swish_beta) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)swish_beta, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetActivationDescriptorSwishBeta) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || @@ -21078,22 +35220,34 @@ cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t rpc_read(0, swish_beta, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)swish_beta, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyActivationDescriptor) < 0 || rpc_write(0, &activationDesc, sizeof(cudnnActivationDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void* alpha, const cudnnTensorDescriptor_t xDesc, const void* x, const void* beta, const cudnnTensorDescriptor_t yDesc, void* y) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, 
(void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnActivationForward) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21109,11 +35263,20 @@ cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescri rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)alpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)x, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)beta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)y, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t* normDesc) { + maybe_copy_unified_arg(0, (void*)normDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateLRNDescriptor) < 0 || rpc_write(0, normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || @@ -21121,11 +35284,17 @@ cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t* normDesc) rpc_read(0, normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)normDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) { + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnN, cudaMemcpyHostToDevice); + 
maybe_copy_unified_arg(0, (void*)&lrnAlpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnBeta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&lrnK, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnSetLRNDescriptor) < 0 || rpc_write(0, &normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || @@ -21136,11 +35305,21 @@ cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnN, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnAlpha, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnBeta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&lrnK, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned* lrnN, double* lrnAlpha, double* lrnBeta, double* lrnK) { + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnN, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnAlpha, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnBeta, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)lrnK, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetLRNDescriptor) < 0 || rpc_write(0, &normDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || @@ -21155,22 +35334,32 @@ cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned* lrn rpc_read(0, lrnK, sizeof(double)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&normDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnN, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnAlpha, cudaMemcpyDeviceToHost); + 
maybe_copy_unified_arg(0, (void*)lrnBeta, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)lrnK, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { + maybe_copy_unified_arg(0, (void*)&lrnDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyLRNDescriptor) < 0 || rpc_write(0, &lrnDesc, sizeof(cudnnLRNDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&lrnDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, cudnnBatchNormMode_t mode) { + maybe_copy_unified_arg(0, (void*)&derivedBnDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDeriveBNTensorDescriptor) < 0 || rpc_write(0, &derivedBnDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -21179,11 +35368,19 @@ cudnnStatus_t cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDes rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&derivedBnDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, cudnnTensorDescriptor_t derivedNormMeanVarDesc, const cudnnTensorDescriptor_t xDesc, cudnnNormMode_t mode, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&derivedNormScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&derivedNormMeanVarDesc, 
cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDeriveNormTensorDescriptor) < 0 || rpc_write(0, &derivedNormScaleBiasDesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -21194,11 +35391,17 @@ cudnnStatus_t cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNor rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&derivedNormScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&derivedNormMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t* stDesc) { + maybe_copy_unified_arg(0, (void*)stDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateSpatialTransformerDescriptor) < 0 || rpc_write(0, stDesc, sizeof(cudnnSpatialTransformerDescriptor_t)) < 0 || @@ -21206,22 +35409,26 @@ cudnnStatus_t cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDes rpc_read(0, stDesc, sizeof(cudnnSpatialTransformerDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)stDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) { + maybe_copy_unified_arg(0, (void*)&stDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroySpatialTransformerDescriptor) < 0 || 
rpc_write(0, &stDesc, sizeof(cudnnSpatialTransformerDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&stDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t* dropoutDesc) { + maybe_copy_unified_arg(0, (void*)dropoutDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnCreateDropoutDescriptor) < 0 || rpc_write(0, dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || @@ -21229,22 +35436,27 @@ cudnnStatus_t cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t* dropoutDesc rpc_read(0, dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)dropoutDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDestroyDropoutDescriptor) < 0 || rpc_write(0, &dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || rpc_wait_for_response(0) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDropoutGetStatesSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21253,11 +35465,15 @@ cudnnStatus_t cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t* sizeInByte rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || 
rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&xdesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnDropoutGetReserveSpaceSize) < 0 || rpc_write(0, &xdesc, sizeof(cudnnTensorDescriptor_t)) < 0 || @@ -21266,11 +35482,18 @@ cudnnStatus_t cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, siz rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&xdesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float* dropout, void** states, unsigned long long* seed) { + maybe_copy_unified_arg(0, (void*)&dropoutDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)dropout, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)states, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)seed, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetDropoutDescriptor) < 0 || rpc_write(0, &dropoutDesc, sizeof(cudnnDropoutDescriptor_t)) < 0 || @@ -21284,6 +35507,11 @@ cudnnStatus_t cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, cu rpc_read(0, seed, sizeof(unsigned long long)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&dropoutDesc, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)dropout, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)states, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)seed, cudaMemcpyDeviceToHost); return return_value; } @@ -21299,6 +35527,15 @@ cudnnStatus_t cudnnOpsVersionCheck() cudnnStatus_t cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const cudnnActivationDescriptor_t activationDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnScaleBiasMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21314,11 +35551,31 @@ cudnnStatus_t cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHand rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnOps, 
cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnScaleBiasMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, const cudnnTensorDescriptor_t dxDesc, const cudnnTensorDescriptor_t dBnScaleBiasDesc, const cudnnActivationDescriptor_t activationDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dBnScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetBatchNormalizationBackwardExWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21336,11 +35593,28 @@ cudnnStatus_t cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t ha rpc_read(0, 
sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dBnScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t xDesc, size_t* sizeInBytes) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetBatchNormalizationTrainingExReserveSpaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21353,11 +35627,29 @@ cudnnStatus_t cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + 
maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&bnOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, cudnnNormMode_t mode, cudnnNormOps_t normOps, cudnnNormAlgo_t algo, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t normScaleBiasDesc, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t normMeanVarDesc, size_t* sizeInBytes, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetNormalizationForwardTrainingWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21376,11 +35668,37 @@ cudnnStatus_t 
cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t ha rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&zDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } cudnnStatus_t cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, cudnnNormMode_t mode, cudnnNormOps_t normOps, cudnnNormAlgo_t algo, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, const cudnnTensorDescriptor_t dxDesc, const cudnnTensorDescriptor_t dNormScaleBiasDesc, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t normMeanVarDesc, size_t* sizeInBytes, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyHostToDevice); 
+ maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&dNormScaleBiasDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetNormalizationBackwardWorkspaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21401,11 +35719,33 @@ cudnnStatus_t cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, c rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&yDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dyDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dzDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dxDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&dNormScaleBiasDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normMeanVarDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } 
cudnnStatus_t cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, cudnnNormMode_t mode, cudnnNormOps_t normOps, cudnnNormAlgo_t algo, const cudnnActivationDescriptor_t activationDesc, const cudnnTensorDescriptor_t xDesc, size_t* sizeInBytes, int groupCnt) { + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyHostToDevice); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyHostToDevice); cudnnStatus_t return_value; if (rpc_start_request(0, RPC_cudnnGetNormalizationTrainingReserveSpaceSize) < 0 || rpc_write(0, &handle, sizeof(cudnnHandle_t)) < 0 || @@ -21420,6 +35760,14 @@ cudnnStatus_t cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle rpc_read(0, sizeInBytes, sizeof(size_t)) < 0 || rpc_end_response(0, &return_value) < 0) return CUDNN_STATUS_NOT_INITIALIZED; + maybe_copy_unified_arg(0, (void*)&handle, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&mode, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&normOps, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&algo, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&activationDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&xDesc, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)sizeInBytes, cudaMemcpyDeviceToHost); + maybe_copy_unified_arg(0, (void*)&groupCnt, cudaMemcpyDeviceToHost); return return_value; } diff --git a/local.sh b/local.sh index 3df7c12..7758b54 100755 --- a/local.sh +++ b/local.sh @@ -234,6 +234,7 @@ build_tests() { nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn 
-lcublas ./test/unified_pointer.cu -o unified_pointer.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/unified_linked.cu -o unified_linked.o nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cublas_unified.cu -o cublas_unified.o + nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn -lcublas ./test/cudnn_managed.cu -o cudnn_managed.o } run() { diff --git a/test/cudnn_managed.cu b/test/cudnn_managed.cu new file mode 100644 index 0000000..5cf6980 --- /dev/null +++ b/test/cudnn_managed.cu @@ -0,0 +1,83 @@ +#include <iostream> +#include <cuda_runtime.h> +#include <cudnn.h> + +/** + * Minimal example to apply sigmoid activation on a tensor + * using cuDNN with Unified Memory. + **/ +int main(int argc, char** argv) +{ + int numGPUs; + cudaGetDeviceCount(&numGPUs); + std::cout << "Found " << numGPUs << " GPUs." << std::endl; + cudaSetDevice(0); // use GPU0 + int device; + struct cudaDeviceProp devProp; + cudaGetDevice(&device); + cudaGetDeviceProperties(&devProp, device); + std::cout << "Compute capability:" << devProp.major << "."
<< devProp.minor << std::endl; + + cudnnHandle_t handle_; + cudnnCreate(&handle_); + std::cout << "Created cuDNN handle" << std::endl; + + // Create the tensor descriptor + cudnnDataType_t dtype = CUDNN_DATA_FLOAT; + cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; + + int n = 1, c = 1, h = 1, w = 10; + int NUM_ELEMENTS = n * c * h * w; + cudnnTensorDescriptor_t x_desc; + + cudnnCreateTensorDescriptor(&x_desc); + cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w); + + // Allocate unified memory for the tensor + float *x; + cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float)); + + // Initialize the tensor + for (int i = 0; i < NUM_ELEMENTS; i++) x[i] = i * 1.00f; + + std::cout << "Original array: "; + for (int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " "; + std::cout << std::endl; + + // Synchronize to ensure data is accessible on the device + cudaDeviceSynchronize(); + + // Create activation function descriptor + float alpha[1] = {1}; + float beta[1] = {0.0}; + cudnnActivationDescriptor_t sigmoid_activation; + cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID; + cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN; + cudnnCreateActivationDescriptor(&sigmoid_activation); + cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0f); + + cudnnActivationForward( + handle_, + sigmoid_activation, + alpha, + x_desc, + x, + beta, + x_desc, + x + ); + + // Synchronize to ensure data is updated on the host + cudaDeviceSynchronize(); + + cudnnDestroy(handle_); + std::cout << "Destroyed cuDNN handle." << std::endl; + + std::cout << "New array: "; + for (int i = 0; i < NUM_ELEMENTS; i++) std::cout << x[i] << " "; + std::cout << std::endl; + + cudaFree(x); + + return 0; +}